diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 68aff793ae6aa55ac1eedc130446c0b5e7046f2a..76f6d7aeca0d86f128dc820005166044525547eb 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 
 def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 50431d0cd4c5e69362f4047918ec42edea497ae9..5ea5a50a258a434cbece3fa82163ce53f4a86e7e 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -218,7 +218,7 @@ if __name__ == "__main__":
         "--xaxis",
         type=str,
         default="# of max concurrency.",
-        help="column name to use as X Axis in comparision graph",
+        help="column name to use as X Axis in comparison graph",
     )
     args = parser.parse_args()
 
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
index 2d88a0b30c4f8434eb371b84d88c4fa8fb5ae7bf..f758097e098e48a30a467ff4cc6cb130b6931f56 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -1,6 +1,6 @@
 [
     {
-        "test_name": "serving_llama8B_tp1_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -32,7 +32,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -64,7 +64,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -96,7 +96,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp1_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -131,7 +131,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -166,7 +166,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -198,5 +198,413 @@
 	    "random-output-len": 128,
             "num_prompts": 1000
         }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 4,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
     }
 ]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
index 823abbaa99f864428ff4e198cd3771752b5197eb..ce396d6e54f278cd8605a844469828301dff212f 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -1,6 +1,6 @@
 [
     {
-        "test_name": "serving_llama8B_pp1_sharegpt",
+        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -32,7 +32,39 @@
         }
     },
     {
-        "test_name": "serving_llama8B_pp3_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -64,7 +96,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2pp3_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -97,7 +129,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_pp1_random_128_128",
+        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -132,7 +164,42 @@
         }
     },
     {
-        "test_name": "serving_llama8B_pp3_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -167,7 +234,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2pp3_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -201,5 +268,553 @@
 	    "ignore-eos": "",
             "num_prompts": 1000
         }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 1,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+	    "VLLM_CPU_SGL_KERNEL": 1,
+	    "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+	    "quantization": "awq",
+            "tensor_parallel_size": 2,
+            "pipeline_parallel_size": 3,
+	    "dtype": "bfloat16",
+	    "distributed_executor_backend": "mp",
+	    "block_size": 128,
+	    "trust_remote_code": "",
+	    "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+	    "enforce_eager": "",
+	    "max_num_batched_tokens": 2048,
+	    "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "random",
+	    "random-input-len": 128,
+	    "random-output-len": 128,
+	    "ignore-eos": "",
+            "num_prompts": 1000
+        }
     }
 ]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 92a1bcada3879bbd5299b7359aa94e4c6648142f..a1de41652c9a6dd7897c93ce4b1cb85949f2299b 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,21 +1,24 @@
 steps:
-  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.8"
-    id: build-wheel-arm64-cuda-12-8
+  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  - label: "Build arm64 wheel - CUDA 12.9"
+    id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  # x86 + CUDA builds
+  - block: "Build CUDA 12.8 wheel"
+    key: block-build-cu128-wheel
+
   - label: "Build wheel - CUDA 12.8"
+    depends_on: block-build-cu128-wheel
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -44,18 +47,14 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
-  # However, this block can be uncommented to save some compute hours.
-  # - block: "Build CUDA 11.8 wheel"
-  #   key: block-build-cu118-wheel
-
-  - label: "Build wheel - CUDA 11.8"
-    # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
+  # x86 + CUDA builds
+  - label: "Build wheel - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
@@ -75,6 +74,7 @@ steps:
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
+  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@@ -82,7 +82,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 
   # Add job to create multi-arch manifest
@@ -103,7 +103,7 @@ steps:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
       - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
+      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
@@ -150,18 +150,24 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
+  - label: "Build and publish nightly multi-arch image to DockerHub"
+    depends_on:
+      - create-multi-arch-manifest
+    if: build.env("NIGHTLY") == "1"
     agents:
-      queue: neuron-postmerge
+      queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1a82f7d085233bdc8262a34201cb610b3ca393b4
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub token from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+    exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+    local page=1
+    local all_tags=""
+    
+    while true; do
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+            "$REPO_API_URL?page=$page&page_size=100")
+        
+        # Get both last_updated timestamp and tag name, separated by |
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+        
+        if [ -z "$tags" ]; then
+            break
+        fi
+        
+        all_tags="$all_tags$tags"$'\n'
+        page=$((page + 1))
+    done
+    
+    # Sort by timestamp (newest first) and extract just the tag names
+    echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+    local tag_name="$1"
+    echo "Deleting tag: $tag_name"
+    
+    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    
+    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+    else
+        echo "Successfully deleted tag: $tag_name"
+    fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+    echo "No tags found to clean up"
+    exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+    exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+    echo "No tags to delete"
+    exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+    if [ -n "$tag" ]; then
+        delete_tag "$tag"
+        # Add a small delay to avoid rate limiting
+        sleep 1
+    fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c83261757a6b1a2812c032be4f43558da..0000000000000000000000000000000000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
-    last_build=$(cat /tmp/neuron-docker-build-timestamp)
-    current_time=$(date +%s)
-    if [ $((current_time - last_build)) -gt 86400 ]; then
-        # Remove dangling images (those that are not tagged and not used by any container)
-        docker image prune -f
-        # Remove unused volumes / force the system prune for old images as well.
-        docker volume prune -f && docker system prune -f
-        echo "$current_time" > /tmp/neuron-docker-build-timestamp
-    fi
-else
-    date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
-    docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
-       -v "${HF_CACHE}:${HF_MOUNT}" \
-       -e "HF_HOME=${HF_MOUNT}" \
-       -e "HF_TOKEN=${HF_TOKEN}" \
-       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
-       --name "${container_name}" \
-       ${image_name} \
-       /bin/bash -c "
-            set -e; # Exit on first error
-            python3 /workspace/vllm/examples/offline_inference/neuron.py;
-            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
-            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-                echo \"Running test file: \$f\";
-                python3 -m pytest \$f -v --capture=tee-sys;
-            done
-       "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 73f3e63fbf5f6dac30e03a6aad910cb041620eec..8c9b00990e9953b246d6d298d52aaf7ad3dcb4ab 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -30,10 +30,12 @@ docker run \
     bash -c '
     set -e
     echo $ZE_AFFINITY_MASK
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    pip install tblib==3.1.0
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
     cd tests
     pytest -v -s v1/core
     pytest -v -s v1/engine
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 745f285c008ad846051f85e88972e6525869739c..43aa8c47be2992975a9264d2da7f0e6d06698d4b 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 
-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
     # if $normal_wheel matches cu126, do not upload the index.html
     echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu129 wheels (default wheels) as it
+    # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -74,14 +75,15 @@ fi
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 
-if [[ $normal_wheel == *"cu118"* ]]; then
-    # if $normal_wheel matches cu118, do not upload the index.html
-    echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
     # if $normal_wheel matches cu126, do not upload the index.html
     echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+    # if $normal_wheel matches cu128, do not upload the index.html
+    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu129 wheels (default wheels) as it
+    # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 55349e0ac9321df1dd343e77d0a7ee49bf80991d..b0f5fe418dcf3783524a9d71fc504d493888061c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -41,7 +41,8 @@ steps:
   commands:
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+  timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -53,6 +54,7 @@ steps:
   - tests/utils_
   - tests/worker
   - tests/standalone_tests/lazy_imports.py
+  - tests/transformers_utils
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
@@ -62,8 +64,10 @@ steps:
   - pytest -v -s multimodal
   - pytest -v -s utils_ # Utils
   - pytest -v -s worker # Worker
+  - pytest -v -s transformers_utils # transformers_utils
 
-- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - tests/standalone_tests/python_only_compile.sh
@@ -71,7 +75,8 @@ steps:
   commands:
   - bash standalone_tests/python_only_compile.sh
 
-- label: Basic Correctness Test # 30min
+- label: Basic Correctness Test # 20min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   fast_check: true
   torch_nightly: true
@@ -88,7 +93,8 @@ steps:
   - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
-- label: Core Test # 10min
+- label: Core Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   fast_check: true
   source_file_dependencies:
@@ -98,7 +104,19 @@ steps:
   commands:
   - pytest -v -s core
 
-- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Unit Tests # 5min
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -114,7 +132,8 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+  timeout_in_minutes: 130
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -126,10 +145,24 @@ steps:
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
   - pytest -v -s entrypoints/test_chat_utils.py
 
-- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+  timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -172,7 +205,8 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
-- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/distributed/eplb
@@ -181,6 +215,7 @@ steps:
   - pytest -v -s distributed/test_eplb_algo.py
 
 - label: EPLB Execution Test # 5min
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -189,13 +224,14 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
 
-- label: Metrics, Tracing Test # 10min
+- label: Metrics, Tracing Test # 12min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   num_gpus: 2
   source_file_dependencies:
   - vllm/
   - tests/metrics
-  - tests/tracing
+  - tests/v1/tracing
   commands:
   - pytest -v -s metrics
   - "pip install \
@@ -208,7 +244,8 @@ steps:
 ##### fast check tests  #####
 #####  1 GPU test  #####
 
-- label: Regression Test # 5min
+- label: Regression Test # 7min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -218,7 +255,8 @@ steps:
   - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
-- label: Engine Test # 10min
+- label: Engine Test # 25min
+  timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -233,7 +271,8 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
-- label: V1 Test e2e + engine
+- label: V1 Test e2e + engine # 30min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
     - vllm/
@@ -244,7 +283,8 @@ steps:
     - pytest -v -s v1/e2e
     - pytest -v -s v1/engine
 
-- label: V1 Test entrypoints
+- label: V1 Test entrypoints # 35min
+  timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
     - vllm/
@@ -252,7 +292,8 @@ steps:
   commands:
     - pytest -v -s v1/entrypoints
 
-- label: V1 Test others
+- label: V1 Test others # 42min
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
     - vllm/
@@ -276,7 +317,8 @@ steps:
     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
-- label: Examples Test # 25min
+- label: Examples Test # 30min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/examples"
   source_file_dependencies:
@@ -294,14 +336,14 @@ steps:
     - python3 offline_inference/vision_language_pooling.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
     - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-    - python3 offline_inference/encoder_decoder.py
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     - python3 offline_inference/basic/classify.py
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
     - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
-- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -309,7 +351,8 @@ steps:
   commands:
     - pytest -v -s cuda/test_cuda_context.py
 
-- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+  timeout_in_minutes: 75
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/layers
@@ -320,15 +363,23 @@ steps:
     - pytest -v -s samplers
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 20min each
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
+  commands:
+    - pytest -v -s lora \
+      --shard-id=$$BUILDKITE_PARALLEL_JOB \
+      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+      --ignore=lora/test_chatglm3_tp.py \
+      --ignore=lora/test_llama_tp.py \
+      --ignore=lora/test_llm_with_multi_loras.py
   parallelism: 4
 
-- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -344,7 +395,8 @@ steps:
     - pytest -v -s compile/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
 
-- label: PyTorch Fullgraph Smoke Test # 9min
+- label: PyTorch Fullgraph Smoke Test # 15min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -352,13 +404,10 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
-  # these tests need to be separated, cannot combine
-  - pytest -v -s compile/piecewise/test_simple.py
-  - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/piecewise/test_full_cudagraph.py
-  - pytest -v -s compile/piecewise/test_multiple_graphs.py
+  - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 18min
+- label: PyTorch Fullgraph Test # 20min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -367,7 +416,8 @@ steps:
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Core Operation Test
+- label: Kernels Core Operation Test # 48min
+  timeout_in_minutes: 75
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
@@ -375,7 +425,8 @@ steps:
   commands:
     - pytest -v -s kernels/core
 
-- label: Kernels Attention Test %N
+- label: Kernels Attention Test %N # 23min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/attention/
@@ -386,7 +437,8 @@ steps:
     - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
 
-- label: Kernels Quantization Test %N
+- label: Kernels Quantization Test %N # 64min
+  timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/quantization/
@@ -396,7 +448,8 @@ steps:
     - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
 
-- label: Kernels MoE Test %N
+- label: Kernels MoE Test %N # 40min
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/quantization/cutlass_w8a8/moe/
@@ -408,7 +461,8 @@ steps:
     - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 2
 
-- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/mamba/
@@ -416,7 +470,8 @@ steps:
   commands:
     - pytest -v -s kernels/mamba
 
-- label: Tensorizer Test # 11min
+- label: Tensorizer Test # 14min
+  timeout_in_minutes: 25
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/model_loader
@@ -428,7 +483,8 @@ steps:
     - pytest -v -s tensorizer_loader
     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
-- label: Model Executor Test
+- label: Model Executor Test # 7min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor
@@ -438,7 +494,8 @@ steps:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s model_executor
 
-- label: Benchmarks # 9min
+- label: Benchmarks # 11min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/.buildkite"
   source_file_dependencies:
@@ -446,7 +503,8 @@ steps:
   commands:
   - bash scripts/run-benchmarks.sh
 
-- label: Benchmarks CLI Test # 10min
+- label: Benchmarks CLI Test # 7min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -454,7 +512,8 @@ steps:
   commands:
   - pytest -v -s benchmarks/
 
-- label: Quantization Test
+- label: Quantization Test # 70min
+  timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
@@ -463,10 +522,15 @@ steps:
   commands:
   # temporary install here since we need nightly, will move to requirements/test.in
   # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
   - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
+  timeout_in_minutes: 75
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
@@ -474,7 +538,8 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
-- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
@@ -483,7 +548,8 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: Encoder Decoder tests # 5min
+- label: Encoder Decoder tests # 12min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
@@ -491,7 +557,8 @@ steps:
   commands:
     - pytest -v -s encoder_decoder
 
-- label: OpenAI-Compatible Tool Use # 20 min
+- label: OpenAI-Compatible Tool Use # 23 min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   fast_check: false
   source_file_dependencies:
@@ -504,7 +571,8 @@ steps:
 
 #####  models test  #####
 
-- label: Basic Models Test # 24min
+- label: Basic Models Test # 57min
+  timeout_in_minutes: 75
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -517,7 +585,8 @@ steps:
     - pytest -v -s models/test_vision.py
     - pytest -v -s models/test_initialization.py
 
-- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 35min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -528,6 +597,7 @@ steps:
     - pytest -v -s models/language -m core_model
 
 - label: Language Models Test (Hybrid) # 35 min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -540,7 +610,8 @@ steps:
     - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
     - pytest -v -s models/language/generation -m hybrid_model
 
-- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
   mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
@@ -551,7 +622,18 @@ steps:
     - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+
 - label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
@@ -560,7 +642,18 @@ steps:
   commands:
     - pytest -v -s models/language/pooling -m 'not core_model'
 
-- label: Multi-Modal Processor Test
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+  timeout_in_minutes: 60
   source_file_dependencies:
   - vllm/
   - tests/models/multimodal
@@ -568,7 +661,8 @@ steps:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/processing
 
-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 60min
+  timeout_in_minutes: 80
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -578,7 +672,7 @@ steps:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
@@ -610,7 +704,8 @@ steps:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
-- label: Quantized Models Test
+- label: Quantized Models Test # 45 min
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/layers/quantization
@@ -640,7 +735,8 @@ steps:
     - python3 examples/offline_inference/audio_language.py --model-type whisper
     - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
 
-- label: Blackwell Test
+- label: Blackwell Test # 38 min
+  timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true
@@ -662,7 +758,8 @@ steps:
     # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
     - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
     - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-    - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
     # Quantization
     - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
     - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
@@ -682,6 +779,7 @@ steps:
 #####  multi gpus test  #####
 
 - label: Distributed Comm Ops Test # 7min
+  timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -693,6 +791,7 @@ steps:
   - pytest -v -s distributed/test_shm_broadcast.py
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -716,7 +815,8 @@ steps:
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
     - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 110min
+  timeout_in_minutes: 150
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -747,7 +847,8 @@ steps:
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
   # test sequence parallel
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
@@ -757,6 +858,7 @@ steps:
   - pytest -v -s models/multimodal/generation/test_maverick.py
 
 - label: Plugin Tests (2 GPUs) # 40min
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -772,7 +874,7 @@ steps:
   # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
   - pip install -e ./plugins/prithvi_io_processor_plugin
   - pytest -v -s plugins_tests/test_io_processor_plugins.py
-  - pip uninstall prithvi_io_processor_plugin -y 
+  - pip uninstall prithvi_io_processor_plugin -y
   # end io_processor plugins test
   # other tests continue here:
   - pytest -v -s plugins_tests/test_scheduler_plugins.py
@@ -782,7 +884,8 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline + Context Parallelism Test # 45min
+  timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -795,8 +898,10 @@ steps:
   commands:
   - pytest -v -s distributed/test_pp_cudagraph.py
   - pytest -v -s distributed/test_pipeline_parallel.py
+  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 
-- label: LoRA TP Test (Distributed)
+- label: LoRA TP Test (Distributed) # 17 min
+  timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental]
   num_gpus: 4
   source_file_dependencies:
@@ -814,9 +919,10 @@ steps:
 
 
 - label: Weight Loading Multiple GPU Test  # 33min
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
-  num_gpus: 2 
+  num_gpus: 2
   optional: true
   source_file_dependencies:
   - vllm/
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 0000000000000000000000000000000000000000..443dfa45af22c16ee3619b76caa0a910735a657c
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attetion/*.py"
+#   - "vllm/v1/core/*.py"
+exclude:
+  - "**/*.py"
+
+scan:
+  functions: true        # check free functions and methods
+  classes: true          # check classes/dataclasses
+  public_only: true      # ignore names starting with "_" at any level
+
+annotations:
+  include:               # decorators that force‑include a symbol
+    - name: "bc_linter_include"  # matched by simple name or dotted suffix
+      propagate_to_members: false # for classes, include methods/inner classes
+  exclude:               # decorators that force‑exclude a symbol
+    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
+      propagate_to_members: true  # for classes, exclude methods/inner classes
+
+excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c087fd555c661e5d23ce16a19cda3c75900accbf..846b68054c0a157af188b12c529cd0e958fe14aa 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,18 +5,21 @@
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @heheda12345
+/vllm/v1/kv_cache_interface.py @heheda12345
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
 /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @heheda12345
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
 
 # Docs
 /docs @hmellor
@@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
 /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 /vllm/model_executor/models/qwen* @sighingnow
 
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/rocm*.py @gshtras
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
 
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
\ No newline at end of file
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 495d207d44260ce57faea2f13dcf725890411a8b..befad23da8664aa820262f72dd23a1882b1a295f 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -273,6 +273,20 @@ pull_request_rules:
       users:
         - "sangstar"
 
+- name: assign reviewer for modelopt changes
+  conditions:
+    - or:
+        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+        - files~=^tests/models/quantization/test_modelopt\.py$
+        - files~=^tests/quantization/test_modelopt\.py$
+        - files~=^tests/models/quantization/test_nvfp4\.py$
+        - files~=^docs/features/quantization/modelopt\.md$
+  actions:
+    assign:
+      users:
+        - "Edwardf0t1"
+
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
       - -conflict
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf5cf44f409fb57923947e9fa3f1791b4..d8bbedef3174bc55cd2e23f5d47cedac18c17f4f 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
         runs-on: ubuntu-latest
         steps:
             -   name: Add label
-                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+                uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
                 with:
                     script: |
                         github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3795b046d7808d98f485ee2fc415bfa869e051fe
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,27 @@
+name: BC Lint
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+
+jobs:
+  bc_lint:
+    if: github.repository_owner == 'vllm-project'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run BC Lint Action
+        uses: pytorch/test-infra/.github/actions/bc-lint@main
+        with:
+          repo: ${{ github.event.pull_request.head.repo.full_name }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+          config_dir: .github
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43a6efd11944c8e7821c1fe9c057b9b23..c3e132a536a42102b0058628fd5ebea38811490e 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
 
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index e0ab3872d8fa377f383b5677b3430cc636335d13..c2b17abe811cdcd133159dd4349bb8ed9235aede 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea  # v7.0.1
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
         with:
           script: |
             // Configuration: Add new labels and keywords here
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 195579f206a2f9f369e3f1adc55786a4a45ad4e5..e21d13b8161f34a372db432fb489297f0da39542 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 1ee605dc7bb0d40e3aba622cf4f18b16c878ee08..8884359fa0ce4ac31d0314b1dfd25a869e7892e9 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Remind to run full CI on PR
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
           script: |
             try {
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa7bc4dc6e0c62cb5e4978c53417dd6ad..82844810a633a18a73ddd2a73e12d0cd9073934e 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 465935d488f8453abf81068788f3aee1be8346dd..b1df673e83ca8c1f94fbeb1e1754cd541c39ea26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
 
-# triton jit 
+# triton jit
 .triton
 
 # Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
 # VSCode
 .vscode/
 
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
 # DS Store
 .DS_Store
 
@@ -209,4 +217,4 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
\ No newline at end of file
+ep_kernels_workspace/
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380cac8ed79cdcb4267033052453ab72e..38158259032a69d0c44cd0e34d23fca8948a5a33 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b845f099d01d95ae03cadd619dafdc4843..fb3cccbb4a9c156bc3aa0b08c8333e1d5340dcda 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
-include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt
 
diff --git a/README.md b/README.md
index 8812aac4ea266e3e360984c1a765968c8b992aed..b4a3583c214cf8c51674a183534b53441f3df133 100644
--- a/README.md
+++ b/README.md
@@ -14,19 +14,24 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
 ---
 
 *Latest News* 🔥
 
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
-- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
-- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 
 <details>
 <summary>Previous News</summary>
 
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 38072152b653b831375b46802fd9ce88d29b2b8a..ee172642033dec9b3dff6f48f9c888656f251a89 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -95,6 +95,24 @@ become available.
       <td style="text-align: center;">✅</td>
       <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
     </tr>
+    <tr>
+      <td><strong>HuggingFace-MTBench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>philschmid/mt-bench</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-Blazedit</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
+    </tr>
+    <tr>
+      <td><strong>Spec Bench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
+    </tr>
     <tr>
       <td><strong>Custom</strong></td>
       <td style="text-align: center;">✅</td>
@@ -110,7 +128,12 @@ become available.
 
 🚧: to be supported
 
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```
 
 ## 🚀 Example - Online Benchmark
 
@@ -234,6 +257,43 @@ vllm bench serve \
     --num-prompts 2048
 ```
 
+### Spec Bench Benchmark with Speculative Decoding
+
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+
+Run all categories:
+
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+
+Run only a specific category like "summarization":
+
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+    --spec-bench-category "summarization"
+```
+
 ### Other HuggingFaceDataset Examples
 
 ```bash
@@ -290,6 +350,18 @@ vllm bench serve \
     --num-prompts 80
 ```
 
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
+
 ### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -689,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with images:
 
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2.5-VL-7B-Instruct \
   --dataset-name sharegpt \
@@ -716,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with videos:
 
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2.5-VL-7B-Instruct \
   --dataset-name sharegpt \
diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 9aad51df6e0035f5242a2107d9ed99f1d55f833f..3aa988aac2548039a2f999dc442c0cce0630ef3a 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -31,6 +31,12 @@ cd vllm
 
 You must set the following variables at the top of the script before execution.
 
+   Note: You can also override the default values below via environment variables when running the script.
+
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 82c20ffa6554c964df14999b01c74ef090680013..ed3679b66f805c514944f0993e6b7b2cdc4a0c70 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -5,25 +5,41 @@
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
-TP=1
-DOWNLOAD_DIR=""
-INPUT_LEN=4000
-OUTPUT_LEN=16
-MAX_MODEL_LEN=4096
-MIN_CACHE_HIT_PCT=0
-MAX_LATENCY_ALLOWED_MS=100000000000
-NUM_SEQS_LIST="128 256"
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM=${SYSTEM:-"TPU"}
+TP=${TP:-1}
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"
 
-echo "result file: $RESULT"
-echo "model: $MODEL"
+echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="
 
 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@@ -213,7 +229,7 @@ run_benchmark() {
 
     pkill -if vllm
     sleep 10
-    printf '=%.0s' $(seq 1 20)
+    echo "===================="
     return 0
 }
 
diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index fd363c2ad05143befb3b43b468de462a875bce27..eae8d9927ea391d496e2c5582ea72e67391c93d3 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -57,7 +57,7 @@ def invoke_main() -> None:
         "--num-iteration",
         type=int,
         default=1000,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--allocate-blocks",
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 2ea4f9ccaff2b70d57a791de36dd9ec9ace69fe3..64ffa62c04d85181b856f2fc3de3442873aba458 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
             # [6880, 6881] -> ['Ġcalls', 'here'] ->
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
+            # the encoded sequence is truncated before being decoded again.
             total_input_len = prefix_len + int(input_lens[i])
             re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                 :total_input_len
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d8b960edaa468ad8827575e5968f5c09b63a644e..a7892f3f71243755a9d2cf59c1ad562e1878fda8 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,191 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
+import sys
 
-import argparse
-import dataclasses
-import json
-import os
-import time
-from typing import Any, Optional
-
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
-    if pt_records:
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-
-    engine_args = EngineArgs.from_cli_args(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=1.0,
-        top_p=1.0,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
-
-    def llm_generate():
-        if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-        else:
-            llm.beam_search(
-                dummy_prompts,
-                BeamSearchParams(
-                    beam_width=args.n,
-                    max_tokens=args.output_len,
-                    ignore_eos=True,
-                ),
-            )
-
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
-        else:
-            start_time = time.perf_counter()
-            llm_generate()
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-
-    if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f"Avg latency: {np.mean(latencies)} seconds")
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f"{percentage}% percentile latency: {percentile} seconds")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
-    parser.add_argument("--input-len", type=int, default=32)
-    parser.add_argument("--output-len", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1,
-        help="Number of generated sequences per prompt.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-iters-warmup",
-        type=int,
-        default=10,
-        help="Number of iterations to run for warmup.",
-    )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="profile the generation process of a single batch",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the latency results in JSON format.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
-    )
-
-    parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
 
-    return parser
+Please use the following command instead:
+    vllm bench latency
 
+For help with the new command, run:
+    vllm bench latency --help
 
-if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index c60040d05ab7ad4737e98b1c409ece7bf68c4c4e..11833fa1b3c8bee6d81cbaf9ffe162a8ce77418f 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -77,7 +77,7 @@ def invoke_main() -> None:
         "--num-iteration",
         type=int,
         default=100,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--num-req", type=int, default=128, help="Number of requests in the batch"
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 02f5f585c0c1677db7133b994bf3f090b43448a2..76cf51498020b2581157527acc38987e75e242aa 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,1324 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-r"""Benchmark online serving throughput.
+import sys
 
-On the server side, run one of the following commands:
-    vLLM OpenAI API server
-    vllm serve <your_model> \
-        --swap-space 16
-
-On the client side, run:
-    python benchmarks/benchmark_serving.py \
-        --backend <backend> \
-        --model <your_model> \
-        --dataset-name sharegpt \
-        --dataset-path <path to dataset> \
-        --request-rate <request_rate> \ # By default <request_rate> is inf
-        --num-prompts <num_prompts> # By default <num_prompts> is 1000
-
-    when using tgi backend, add
-        --endpoint /generate_stream
-    to the end of the command above.
-"""
-
-import argparse
-import asyncio
-import gc
-import json
-import os
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Any, Literal, Optional
-
-import numpy as np
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from backend_request_func import (
-    ASYNC_REQUEST_FUNCS,
-    OPENAI_COMPATIBLE_BACKENDS,
-    RequestFuncInput,
-    RequestFuncOutput,
-)
-
-try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
-    from backend_request_func import get_tokenizer
-
-try:
-    from vllm.utils import FlexibleArgumentParser
-except ImportError:
-    from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (
-    AIMODataset,
-    ASRDataset,
-    BurstGPTDataset,
-    ConversationDataset,
-    CustomDataset,
-    HuggingFaceDataset,
-    InstructCoderDataset,
-    MTBenchDataset,
-    NextEditPredictionDataset,
-    RandomDataset,
-    SampleRequest,
-    ShareGPTDataset,
-    SonnetDataset,
-    VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.benchmarks.serve import get_request
-
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
-
-@dataclass
-class BenchmarkMetrics:
-    completed: int
-    total_input: int
-    total_output: int
-    request_throughput: float
-    request_goodput: float
-    output_throughput: float
-    total_token_throughput: float
-    mean_ttft_ms: float
-    median_ttft_ms: float
-    std_ttft_ms: float
-    percentiles_ttft_ms: list[tuple[float, float]]
-    mean_tpot_ms: float
-    median_tpot_ms: float
-    std_tpot_ms: float
-    percentiles_tpot_ms: list[tuple[float, float]]
-    mean_itl_ms: float
-    median_itl_ms: float
-    std_itl_ms: float
-    percentiles_itl_ms: list[tuple[float, float]]
-    # E2EL stands for end-to-end latency per request.
-    # It is the time taken on the client side from sending
-    # a request to receiving a complete response.
-    mean_e2el_ms: float
-    median_e2el_ms: float
-    std_e2el_ms: float
-    percentiles_e2el_ms: list[tuple[float, float]]
-
-
-def calculate_metrics(
-    input_requests: list[SampleRequest],
-    outputs: list[RequestFuncOutput],
-    dur_s: float,
-    tokenizer: PreTrainedTokenizerBase,
-    selected_percentile_metrics: list[str],
-    selected_percentiles: list[float],
-    goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
-    actual_output_lens: list[int] = []
-    total_input = 0
-    completed = 0
-    good_completed = 0
-    itls: list[float] = []
-    tpots: list[float] = []
-    all_tpots: list[float] = []
-    ttfts: list[float] = []
-    e2els: list[float] = []
-    for i in range(len(outputs)):
-        if outputs[i].success:
-            output_len = outputs[i].output_tokens
-
-            if not output_len:
-                # We use the tokenizer to count the number of output tokens
-                # for some serving backends instead of looking at
-                # len(outputs[i].itl) since multiple output tokens may be
-                # bundled together
-                # Note : this may inflate the output token count slightly
-                output_len = len(
-                    tokenizer(
-                        outputs[i].generated_text, add_special_tokens=False
-                    ).input_ids
-                )
-            actual_output_lens.append(output_len)
-            total_input += input_requests[i].prompt_len
-            tpot = 0
-            if output_len > 1:
-                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
-                tpot = latency_minus_ttft / (output_len - 1)
-                tpots.append(tpot)
-            # Note: if output_len <= 1, we regard tpot as 0 for goodput
-            all_tpots.append(tpot)
-            itls += outputs[i].itl
-            ttfts.append(outputs[i].ttft)
-            e2els.append(outputs[i].latency)
-            completed += 1
-        else:
-            actual_output_lens.append(0)
-
-    if goodput_config_dict:
-        valid_metrics = []
-        slo_values = []
-
-        if "ttft" in goodput_config_dict:
-            valid_metrics.append(ttfts)
-            slo_values.append(
-                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-        if "tpot" in goodput_config_dict:
-            valid_metrics.append(all_tpots)
-            slo_values.append(
-                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-        if "e2el" in goodput_config_dict:
-            valid_metrics.append(e2els)
-            slo_values.append(
-                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
-            )
-
-        for req_metric in zip(*valid_metrics):
-            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
-            if is_good_req:
-                good_completed += 1
-
-    if completed == 0:
-        warnings.warn(
-            "All requests failed. This is likely due to a misconfiguration "
-            "on the benchmark arguments.",
-            stacklevel=2,
-        )
-    metrics = BenchmarkMetrics(
-        completed=completed,
-        total_input=total_input,
-        total_output=sum(actual_output_lens),
-        request_throughput=completed / dur_s,
-        request_goodput=good_completed / dur_s,
-        output_throughput=sum(actual_output_lens) / dur_s,
-        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0)
-        * 1000,  # ttfts is empty if streaming is not supported by backend
-        std_ttft_ms=np.std(ttfts or 0) * 1000,
-        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        percentiles_ttft_ms=[
-            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_tpot_ms=np.mean(tpots or 0) * 1000,
-        std_tpot_ms=np.std(tpots or 0) * 1000,
-        median_tpot_ms=np.median(tpots or 0) * 1000,
-        percentiles_tpot_ms=[
-            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_itl_ms=np.mean(itls or 0) * 1000,
-        std_itl_ms=np.std(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
-        percentiles_itl_ms=[
-            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
-        ],
-        mean_e2el_ms=np.mean(e2els or 0) * 1000,
-        std_e2el_ms=np.std(e2els or 0) * 1000,
-        median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[
-            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
-        ],
-    )
-
-    return metrics, actual_output_lens
-
-
-async def benchmark(
-    backend: str,
-    api_url: str,
-    base_url: str,
-    model_id: str,
-    model_name: str,
-    tokenizer: PreTrainedTokenizerBase,
-    input_requests: list[SampleRequest],
-    logprobs: Optional[int],
-    request_rate: float,
-    burstiness: float,
-    disable_tqdm: bool,
-    profile: bool,
-    selected_percentile_metrics: list[str],
-    selected_percentiles: list[float],
-    ignore_eos: bool,
-    goodput_config_dict: dict[str, float],
-    max_concurrency: Optional[int],
-    lora_modules: Optional[Iterable[str]],
-    extra_body: Optional[dict],
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
-):
-    if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS[backend]
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-
-    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0].prompt,
-        input_requests[0].prompt_len,
-        input_requests[0].expected_output_len,
-        input_requests[0].multi_modal_data,
-    )
-
-    assert (
-        test_mm_content is None
-        or isinstance(test_mm_content, dict)
-        or (
-            isinstance(test_mm_content, list)
-            and all(isinstance(item, dict) for item in test_mm_content)
-        )
-    ), "multi_modal_data must be a dict or list[dict]"
-    test_input = RequestFuncInput(
-        model=model_id,
-        model_name=model_name,
-        prompt=test_prompt,
-        api_url=api_url,
-        prompt_len=test_prompt_len,
-        output_len=test_output_len,
-        logprobs=logprobs,
-        multi_modal_content=test_mm_content,
-        ignore_eos=ignore_eos,
-        extra_body=extra_body,
-    )
-
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
-        raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
-        )
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
-
-    if lora_modules:
-        # For each input request, choose a LoRA module at random.
-        lora_modules = iter(
-            [random.choice(lora_modules) for _ in range(len(input_requests))]
-        )
-
-    if profile:
-        print("Starting profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            model_name=model_name,
-            prompt=test_prompt,
-            api_url=base_url + "/start_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            logprobs=logprobs,
-            multi_modal_content=test_mm_content,
-            ignore_eos=ignore_eos,
-            extra_body=extra_body,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler started")
-
-    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-
-    if ramp_up_strategy is not None:
-        print(
-            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
-            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
-            "the duration of the benchmark."
-        )
-    else:
-        print(f"Traffic request rate: {request_rate} RPS.")
-
-    print(f"Burstiness factor: {burstiness} ({distribution})")
-    print(f"Maximum request concurrency: {max_concurrency}")
-
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-
-    async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
-        async with semaphore:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
-
-    benchmark_start_time = time.perf_counter()
-    tasks: list[asyncio.Task] = []
-
-    rps_change_events = []
-    last_int_rps = -1
-    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
-        last_int_rps = ramp_up_start_rps
-        rps_change_events.append(
-            {
-                "rps": last_int_rps,
-                "timestamp": datetime.now().isoformat(),
-            }
-        )
-
-    async for request, current_request_rate in get_request(
-        input_requests,
-        request_rate,
-        burstiness,
-        ramp_up_strategy,
-        ramp_up_start_rps,
-        ramp_up_end_rps,
-    ):
-        if ramp_up_strategy is not None:
-            current_int_rps = int(current_request_rate)
-            if current_int_rps > last_int_rps:
-                timestamp = datetime.now().isoformat()
-                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
-                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
-                last_int_rps = current_int_rps
-
-        prompt, prompt_len, output_len, mm_content, request_id = (
-            request.prompt,
-            request.prompt_len,
-            request.expected_output_len,
-            request.multi_modal_data,
-            request.request_id,
-        )
-        req_model_id, req_model_name = model_id, model_name
-        if lora_modules:
-            req_lora_module = next(lora_modules)
-            req_model_id, req_model_name = req_lora_module, req_lora_module
-
-        request_func_input = RequestFuncInput(
-            model=req_model_id,
-            model_name=req_model_name,
-            prompt=prompt,
-            api_url=api_url,
-            prompt_len=prompt_len,
-            output_len=output_len,
-            logprobs=logprobs,
-            multi_modal_content=mm_content,
-            ignore_eos=ignore_eos,
-            extra_body=extra_body,
-            request_id=request_id,
-        )
-        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
-        tasks.append(asyncio.create_task(task))
-    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
-
-    if pbar is not None:
-        pbar.close()
-
-    benchmark_duration = time.perf_counter() - benchmark_start_time
-
-    metrics, actual_output_lens = calculate_metrics(
-        input_requests=input_requests,
-        outputs=outputs,
-        dur_s=benchmark_duration,
-        tokenizer=tokenizer,
-        selected_percentile_metrics=selected_percentile_metrics,
-        selected_percentiles=selected_percentiles,
-        goodput_config_dict=goodput_config_dict,
-    )
-
-    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
-    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    if max_concurrency is not None:
-        print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
-    if request_rate != float("inf"):
-        print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Request throughput (req/s):", metrics.request_throughput
-        )
-    )
-    if goodput_config_dict:
-        print(
-            "{:<40} {:<10.2f}".format(
-                "Request goodput (req/s):", metrics.request_goodput
-            )
-        )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Output token throughput (tok/s):", metrics.output_throughput
-        )
-    )
-    print(
-        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
-        )
-    )
-
-    result = {
-        "duration": benchmark_duration,
-        "completed": metrics.completed,
-        "total_input_tokens": metrics.total_input,
-        "total_output_tokens": metrics.total_output,
-        "request_throughput": metrics.request_throughput,
-        "request_goodput": metrics.request_goodput if goodput_config_dict else None,
-        "output_throughput": metrics.output_throughput,
-        "total_token_throughput": metrics.total_token_throughput,
-        "input_lens": [output.prompt_len for output in outputs],
-        "output_lens": actual_output_lens,
-        "ttfts": [output.ttft for output in outputs],
-        "itls": [output.itl for output in outputs],
-        "generated_texts": [output.generated_text for output in outputs],
-        "errors": [output.error for output in outputs],
-    }
-
-    if rps_change_events:
-        result["rps_change_events"] = rps_change_events
-
-    def process_one_metric(
-        # E.g., "ttft"
-        metric_attribute_name: str,
-        # E.g., "TTFT"
-        metric_name: str,
-        # E.g., "Time to First Token"
-        metric_header: str,
-    ):
-        # This function prints and adds statistics of the specified
-        # metric.
-        if metric_attribute_name not in selected_percentile_metrics:
-            return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
-        print(
-            "{:<40} {:<10.2f}".format(
-                f"Mean {metric_name} (ms):",
-                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
-            )
-        )
-        print(
-            "{:<40} {:<10.2f}".format(
-                f"Median {metric_name} (ms):",
-                getattr(metrics, f"median_{metric_attribute_name}_ms"),
-            )
-        )
-        result[f"mean_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"mean_{metric_attribute_name}_ms"
-        )
-        result[f"median_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"median_{metric_attribute_name}_ms"
-        )
-        result[f"std_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"std_{metric_attribute_name}_ms"
-        )
-        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
-            p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
-
-    process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
-    process_one_metric("itl", "ITL", "Inter-token Latency")
-    process_one_metric("e2el", "E2EL", "End-to-end Latency")
-
-    print("=" * 50)
-
-    if profile:
-        print("Stopping profiler...")
-        profile_input = RequestFuncInput(
-            model=model_id,
-            prompt=test_prompt,
-            api_url=base_url + "/stop_profile",
-            prompt_len=test_prompt_len,
-            output_len=test_output_len,
-            logprobs=logprobs,
-        )
-        profile_output = await request_func(request_func_input=profile_input)
-        if profile_output.success:
-            print("Profiler stopped")
-
-    return result
-
-
-def check_goodput_args(args):
-    # Check and parse goodput arguments
-    goodput_config_dict = {}
-    VALID_NAMES = ["ttft", "tpot", "e2el"]
-    if args.goodput:
-        goodput_config_dict = parse_goodput(args.goodput)
-        for slo_name, slo_val in goodput_config_dict.items():
-            if slo_name not in VALID_NAMES:
-                raise ValueError(
-                    f"Invalid metric name found, {slo_name}: {slo_val}. "
-                    "The service level objective name should be one of "
-                    f"{str(VALID_NAMES)}. "
-                )
-            if slo_val < 0:
-                raise ValueError(
-                    f"Invalid value found, {slo_name}: {slo_val}. "
-                    "The service level objective value should be "
-                    "non-negative."
-                )
-    return goodput_config_dict
-
-
-def parse_goodput(slo_pairs):
-    goodput_config_dict = {}
-    try:
-        for slo_pair in slo_pairs:
-            slo_name, slo_val = slo_pair.split(":")
-            goodput_config_dict[slo_name] = float(slo_val)
-    except ValueError as err:
-        raise argparse.ArgumentTypeError(
-            "Invalid format found for service level objectives. "
-            'Specify service level objectives for goodput as "KEY:VALUE" '
-            "pairs, where the key is a metric name, and the value is a "
-            "number in milliseconds."
-        ) from err
-    return goodput_config_dict
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any], file_name: str
-) -> None:
-    metrics = [
-        "median_ttft_ms",
-        "mean_ttft_ms",
-        "std_ttft_ms",
-        "p99_ttft_ms",
-        "mean_tpot_ms",
-        "median_tpot_ms",
-        "std_tpot_ms",
-        "p99_tpot_ms",
-        "median_itl_ms",
-        "mean_itl_ms",
-        "std_itl_ms",
-        "p99_itl_ms",
-    ]
-    # These raw data might be useful, but they are rather big. They can be added
-    # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={k: [results[k]] for k in metrics},
-        extra_info={
-            k: results[k]
-            for k in results
-            if k not in metrics and k not in ignored_metrics
-        },
-    )
-    if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
-        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-@deprecated(
-    "benchmark_serving.py is deprecated and will be removed in a future "
-    "version. Please use 'vllm bench serve' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-
-    backend = args.backend
-    model_id = args.model
-    model_name = args.served_model_name
-    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-    tokenizer_mode = args.tokenizer_mode
-
-    # Validate ramp-up arguments
-    if args.ramp_up_strategy is not None:
-        if args.request_rate != float("inf"):
-            raise ValueError(
-                "When using ramp-up, do not specify --request-rate. "
-                "The request rate will be controlled by ramp-up parameters. "
-                "Please remove the --request-rate argument."
-            )
-        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
-            raise ValueError(
-                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
-                "--ramp-up-end-rps must be specified"
-            )
-        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
-            raise ValueError("Ramp-up start and end RPS must be non-negative")
-        if args.ramp_up_start_rps > args.ramp_up_end_rps:
-            raise ValueError("Ramp-up start RPS must be less than end RPS")
-        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
-            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
-
-    if args.base_url is not None:
-        api_url = f"{args.base_url}{args.endpoint}"
-        base_url = f"{args.base_url}"
-    else:
-        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
-        base_url = f"http://{args.host}:{args.port}"
-
-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        tokenizer_mode=tokenizer_mode,
-        trust_remote_code=args.trust_remote_code,
-    )
-
-    if args.dataset_name is None:
-        raise ValueError(
-            "Please specify '--dataset-name' and the corresponding "
-            "'--dataset-path' if required."
-        )
-
-    if args.dataset_name == "custom":
-        dataset = CustomDataset(dataset_path=args.dataset_path)
-        input_requests = dataset.sample(
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            output_len=args.custom_output_len,
-            skip_chat_template=args.custom_skip_chat_template,
-            request_id_prefix=args.request_id_prefix,
-        )
-
-    elif args.dataset_name == "sonnet":
-        dataset = SonnetDataset(dataset_path=args.dataset_path)
-        # For the "sonnet" dataset, formatting depends on the backend.
-        if args.backend == "openai-chat":
-            input_requests = dataset.sample(
-                num_requests=args.num_prompts,
-                input_len=args.sonnet_input_len,
-                output_len=args.sonnet_output_len,
-                prefix_len=args.sonnet_prefix_len,
-                tokenizer=tokenizer,
-                return_prompt_formatted=False,
-                request_id_prefix=args.request_id_prefix,
-            )
-        else:
-            assert tokenizer.chat_template or tokenizer.default_chat_template, (
-                "Tokenizer/model must have chat template for sonnet dataset."
-            )
-            input_requests = dataset.sample(
-                num_requests=args.num_prompts,
-                input_len=args.sonnet_input_len,
-                output_len=args.sonnet_output_len,
-                prefix_len=args.sonnet_prefix_len,
-                tokenizer=tokenizer,
-                return_prompt_formatted=True,
-                request_id_prefix=args.request_id_prefix,
-            )
-
-    elif args.dataset_name == "hf":
-        # all following datasets are implemented from the
-        # HuggingFaceDataset base class
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = VisionArenaDataset
-            args.hf_split = "train"
-            args.hf_subset = None
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = InstructCoderDataset
-            args.hf_split = "train"
-        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = MTBenchDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ConversationDataset
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = AIMODataset
-            args.hf_split = "train"
-        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
-            dataset_class = NextEditPredictionDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ASRDataset
-            args.hf_split = "train"
-        else:
-            supported_datasets = set(
-                [
-                    dataset_name
-                    for cls in HuggingFaceDataset.__subclasses__()
-                    for dataset_name in cls.SUPPORTED_DATASET_PATHS
-                ]
-            )
-            raise ValueError(
-                f"Unsupported dataset path: {args.dataset_path}. "
-                "Huggingface dataset only supports dataset_path"
-                f" from one of following: {supported_datasets}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats."
-            )
-
-        if dataset_class.IS_MULTIMODAL and backend not in [
-            "openai-chat",
-            "openai-audio",
-        ]:
-            # multi-modal benchmark is only available on OpenAI Chat backend.
-            raise ValueError(
-                "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' backend."
-            )
-        input_requests = dataset_class(
-            dataset_path=args.dataset_path,
-            dataset_subset=args.hf_subset,
-            dataset_split=args.hf_split,
-            random_seed=args.seed,
-            no_stream=args.no_stream,
-        ).sample(
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            output_len=args.hf_output_len,
-            request_id_prefix=args.request_id_prefix,
-        )
-
-    else:
-        # For datasets that follow a similar structure, use a mapping.
-        dataset_mapping = {
-            "sharegpt": lambda: ShareGPTDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
-            ).sample(
-                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
-                output_len=args.sharegpt_output_len,
-                request_id_prefix=args.request_id_prefix,
-            ),
-            "burstgpt": lambda: BurstGPTDataset(
-                random_seed=args.seed, dataset_path=args.dataset_path
-            ).sample(
-                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
-                request_id_prefix=args.request_id_prefix,
-            ),
-            "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
-                tokenizer=tokenizer,
-                num_requests=args.num_prompts,
-                prefix_len=args.random_prefix_len,
-                input_len=args.random_input_len,
-                output_len=args.random_output_len,
-                range_ratio=args.random_range_ratio,
-                request_id_prefix=args.request_id_prefix,
-            ),
-        }
-
-        try:
-            input_requests = dataset_mapping[args.dataset_name]()
-        except KeyError as err:
-            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
-    goodput_config_dict = check_goodput_args(args)
-
-    # Collect the sampling parameters.
-    sampling_params = {
-        k: v
-        for k, v in {
-            "top_p": args.top_p,
-            "top_k": args.top_k,
-            "min_p": args.min_p,
-            "temperature": args.temperature,
-        }.items()
-        if v is not None
-    }
-
-    # Sampling parameters are only supported by openai-compatible backend.
-    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
-        raise ValueError(
-            "Sampling parameters are only supported by openai-compatible backends."
-        )
-
-    if "temperature" not in sampling_params:
-        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
-
-    if args.backend == "llama.cpp":
-        # Disable prompt caching in llama.cpp backend
-        sampling_params["cache_prompt"] = False
-
-    # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
-
-    benchmark_result = asyncio.run(
-        benchmark(
-            backend=backend,
-            api_url=api_url,
-            base_url=base_url,
-            model_id=model_id,
-            model_name=model_name,
-            tokenizer=tokenizer,
-            input_requests=input_requests,
-            logprobs=args.logprobs,
-            request_rate=args.request_rate,
-            burstiness=args.burstiness,
-            disable_tqdm=args.disable_tqdm,
-            profile=args.profile,
-            selected_percentile_metrics=args.percentile_metrics.split(","),
-            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
-            ignore_eos=args.ignore_eos,
-            goodput_config_dict=goodput_config_dict,
-            max_concurrency=args.max_concurrency,
-            lora_modules=args.lora_modules,
-            extra_body=sampling_params,
-            ramp_up_strategy=args.ramp_up_strategy,
-            ramp_up_start_rps=args.ramp_up_start_rps,
-            ramp_up_end_rps=args.ramp_up_end_rps,
-        )
-    )
-
-    # Save config and results to json
-    if args.save_result or args.append_result:
-        result_json: dict[str, Any] = {}
-
-        # Setup
-        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
-        result_json["date"] = current_dt
-        result_json["backend"] = backend
-        result_json["model_id"] = model_id
-        result_json["tokenizer_id"] = tokenizer_id
-        result_json["num_prompts"] = args.num_prompts
-
-        # Metadata
-        if args.metadata:
-            for item in args.metadata:
-                if "=" in item:
-                    kvstring = item.split("=")
-                    result_json[kvstring[0].strip()] = kvstring[1].strip()
-                else:
-                    raise ValueError(
-                        "Invalid metadata format. Please use KEY=VALUE format."
-                    )
-        # Traffic
-        result_json["request_rate"] = (
-            args.request_rate if args.request_rate < float("inf") else "inf"
-        )
-        result_json["burstiness"] = args.burstiness
-        result_json["max_concurrency"] = args.max_concurrency
-
-        if args.ramp_up_strategy is not None:
-            result_json["ramp_up_strategy"] = args.ramp_up_strategy
-            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
-            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
-
-        # Merge with benchmark result
-        result_json = {**result_json, **benchmark_result}
-
-        if not args.save_detailed:
-            # Remove fields with too many data points
-            for field in [
-                "input_lens",
-                "output_lens",
-                "ttfts",
-                "itls",
-                "generated_texts",
-                "errors",
-            ]:
-                if field in result_json:
-                    del result_json[field]
-                if field in benchmark_result:
-                    del benchmark_result[field]
-
-        # Save to file
-        base_model_id = model_id.split("/")[-1]
-        max_concurrency_str = (
-            f"-concurrency{args.max_concurrency}"
-            if args.max_concurrency is not None
-            else ""
-        )
-        if args.ramp_up_strategy is not None:
-            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
-        else:
-            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
-        if args.result_filename:
-            file_name = args.result_filename
-        if args.result_dir:
-            os.makedirs(args.result_dir, exist_ok=True)
-            file_name = os.path.join(args.result_dir, file_name)
-        with open(
-            file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
-        ) as outfile:
-            # Append a newline.
-            if args.append_result and outfile.tell() != 0:
-                outfile.write("\n")
-            json.dump(result_json, outfile)
-        save_to_pytorch_benchmark_format(args, result_json, file_name)
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput."
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="vllm",
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-    )
-    parser.add_argument(
-        "--base-url",
-        type=str,
-        default=None,
-        help="Server or API base url if not using http host and port.",
-    )
-    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
-    parser.add_argument("--host", type=str, default="127.0.0.1")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        default="/v1/completions",
-        help="API endpoint.",
-    )
-    parser.add_argument(
-        "--dataset-name",
-        type=str,
-        default="sharegpt",
-        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
-        help="Name of the dataset to benchmark on.",
-    )
-    parser.add_argument(
-        "--dataset-path",
-        type=str,
-        default=None,
-        help="Path to the sharegpt/sonnet dataset. "
-        "Or the huggingface dataset ID if using HF dataset.",
-    )
-    parser.add_argument(
-        "--no-stream",
-        action="store_true",
-        help="Do not load the dataset in streaming mode.",
-    )
-    parser.add_argument(
-        "--max-concurrency",
-        type=int,
-        default=None,
-        help="Maximum number of concurrent requests. This can be used "
-        "to help simulate an environment where a higher level component "
-        "is enforcing a maximum number of concurrent requests. While the "
-        "--request-rate argument controls the rate at which requests are "
-        "initiated, this argument will control how many are actually allowed "
-        "to execute at a time. This means that when used in combination, the "
-        "actual request rate may be lower than specified with --request-rate, "
-        "if the server is not processing requests fast enough to keep up.",
-    )
-
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="Name of the model.",
-    )
-    parser.add_argument(
-        "--tokenizer",
-        type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-prompts",
-        type=int,
-        default=1000,
-        help="Number of prompts to process.",
-    )
-    parser.add_argument(
-        "--logprobs",
-        type=int,
-        default=None,
-        help=(
-            "Number of logprobs-per-token to compute & return as part of "
-            "the request. If unspecified, then either (1) if beam search "
-            "is disabled, no logprobs are computed & a single dummy "
-            "logprob is returned for each token; or (2) if beam search "
-            "is enabled 1 logprob per token is computed"
-        ),
-    )
-    parser.add_argument(
-        "--request-rate",
-        type=float,
-        default=float("inf"),
-        help="Number of requests per second. If this is inf, "
-        "then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process or gamma distribution "
-        "to synthesize the request arrival times.",
-    )
-    parser.add_argument(
-        "--burstiness",
-        type=float,
-        default=1.0,
-        help="Burstiness factor of the request generation. "
-        "Only take effect when request_rate is not inf. "
-        "Default value is 1, which follows Poisson process. "
-        "Otherwise, the request intervals follow a gamma distribution. "
-        "A lower burstiness value (0 < burstiness < 1) results in more "
-        "bursty requests. A higher burstiness value (burstiness > 1) "
-        "results in a more uniform arrival of requests.",
-    )
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument(
-        "--trust-remote-code",
-        action="store_true",
-        help="Trust remote code from huggingface",
-    )
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--save-result",
-        action="store_true",
-        help="Specify to save benchmark results to a json file",
-    )
-    parser.add_argument(
-        "--save-detailed",
-        action="store_true",
-        help="When saving the results, whether to include per request "
-        "information such as response, error, ttfs, tpots, etc.",
-    )
-    parser.add_argument(
-        "--append-result",
-        action="store_true",
-        help="Append the benchmark result to the existing json file.",
-    )
-    parser.add_argument(
-        "--metadata",
-        metavar="KEY=VALUE",
-        nargs="*",
-        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
-        "for metadata of this run to be saved in the result JSON file "
-        "for record keeping purposes.",
-    )
-    parser.add_argument(
-        "--result-dir",
-        type=str,
-        default=None,
-        help="Specify directory to save benchmark json results."
-        "If not specified, results are saved in the current directory.",
-    )
-    parser.add_argument(
-        "--result-filename",
-        type=str,
-        default=None,
-        help="Specify the filename to save benchmark json results."
-        "If not specified, results will be saved in "
-        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
-        " format.",
-    )
-    parser.add_argument(
-        "--ignore-eos",
-        action="store_true",
-        help="Set ignore_eos flag when sending the benchmark request."
-        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
-    )
-    parser.add_argument(
-        "--percentile-metrics",
-        type=str,
-        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
-        "This argument specifies the metrics to report percentiles. "
-        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
-        'Default value is "ttft,tpot,itl".',
-    )
-    parser.add_argument(
-        "--metric-percentiles",
-        type=str,
-        default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
-        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
-        'Default value is "99". '
-        'Use "--percentile-metrics" to select metrics.',
-    )
-    parser.add_argument(
-        "--goodput",
-        nargs="+",
-        required=False,
-        help='Specify service level objectives for goodput as "KEY:VALUE" '
-        "pairs, where the key is a metric name, and the value is in "
-        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
-        "separated by spaces. Allowed request level metric names are "
-        '"ttft", "tpot", "e2el". For more context on the definition of '
-        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
-    )
-    parser.add_argument(
-        "--request-id-prefix",
-        type=str,
-        required=False,
-        default="benchmark-serving",
-        help="Specify the prefix of request id.",
-    )
-
-    # group for dataset specific arguments
-    custom_group = parser.add_argument_group("custom dataset options")
-    custom_group.add_argument(
-        "--custom-output-len",
-        type=int,
-        default=256,
-        help="Number of output tokens per request, used only for custom dataset.",
-    )
-    custom_group.add_argument(
-        "--custom-skip-chat-template",
-        action="store_true",
-        help="Skip applying chat template to prompt, used only for custom dataset.",
-    )
-
-    sonnet_group = parser.add_argument_group("sonnet dataset options")
-    sonnet_group.add_argument(
-        "--sonnet-input-len",
-        type=int,
-        default=550,
-        help="Number of input tokens per request, used only for sonnet dataset.",
-    )
-    sonnet_group.add_argument(
-        "--sonnet-output-len",
-        type=int,
-        default=150,
-        help="Number of output tokens per request, used only for sonnet dataset.",
-    )
-    sonnet_group.add_argument(
-        "--sonnet-prefix-len",
-        type=int,
-        default=200,
-        help="Number of prefix tokens per request, used only for sonnet dataset.",
-    )
-
-    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
-    sharegpt_group.add_argument(
-        "--sharegpt-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output length "
-        "from the ShareGPT dataset.",
-    )
-
-    random_group = parser.add_argument_group("random dataset options")
-    random_group.add_argument(
-        "--random-input-len",
-        type=int,
-        default=1024,
-        help="Number of input tokens per request, used only for random sampling.",
-    )
-    random_group.add_argument(
-        "--random-output-len",
-        type=int,
-        default=128,
-        help="Number of output tokens per request, used only for random sampling.",
-    )
-    random_group.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=0.0,
-        help="Range ratio for sampling input/output length, "
-        "used only for random sampling. Must be in the range [0, 1) to define "
-        "a symmetric sampling range"
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
-    )
-    random_group.add_argument(
-        "--random-prefix-len",
-        type=int,
-        default=0,
-        help=(
-            "Number of fixed prefix tokens before the random context "
-            "in a request. "
-            "The total input length is the sum of `random-prefix-len` and "
-            "a random "
-            "context length sampled from [input_len * (1 - range_ratio), "
-            "input_len * (1 + range_ratio)]."
-        ),
-    )
-
-    hf_group = parser.add_argument_group("hf dataset options")
-    hf_group.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-    )
-    hf_group.add_argument(
-        "--hf-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output lengths "
-        "from the sampled HF dataset.",
-    )
-
-    sampling_group = parser.add_argument_group("sampling parameters")
-    sampling_group.add_argument(
-        "--top-p",
-        type=float,
-        default=None,
-        help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
-    )
-    sampling_group.add_argument(
-        "--top-k",
-        type=int,
-        default=None,
-        help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
-    )
-    sampling_group.add_argument(
-        "--min-p",
-        type=float,
-        default=None,
-        help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
-    )
-    sampling_group.add_argument(
-        "--temperature",
-        type=float,
-        default=None,
-        help="Temperature sampling parameter. Only has effect on "
-        "openai-compatible backends. If not specified, default to greedy "
-        "decoding (i.e. temperature==0.0).",
-    )
-
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        choices=["auto", "slow", "mistral", "custom"],
-        help='The tokenizer mode.\n\n* "auto" will use the '
-        'fast tokenizer if available.\n* "slow" will '
-        "always use the slow tokenizer. \n* "
-        '"mistral" will always use the `mistral_common` tokenizer. \n*'
-        '"custom" will use --tokenizer to select the preregistered tokenizer.',
-    )
-
-    parser.add_argument(
-        "--served-model-name",
-        type=str,
-        default=None,
-        help="The model name used in the API. "
-        "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
-    )
-
-    parser.add_argument(
-        "--lora-modules",
-        nargs="+",
-        default=None,
-        help="A subset of LoRA module names passed in when "
-        "launching the server. For each request, the "
-        "script chooses a LoRA module at random.",
-    )
-
-    parser.add_argument(
-        "--ramp-up-strategy",
-        type=str,
-        default=None,
-        choices=["linear", "exponential"],
-        help="The ramp-up strategy. This would be used to "
-        "ramp up the request rate from initial RPS to final "
-        "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
-        "over the duration of the benchmark.",
-    )
-    parser.add_argument(
-        "--ramp-up-start-rps",
-        type=int,
-        default=None,
-        help="The starting request rate for ramp-up (RPS). "
-        "Needs to be specified when --ramp-up-strategy is used.",
-    )
-    parser.add_argument(
-        "--ramp-up-end-rps",
-        type=int,
-        default=None,
-        help="The ending request rate for ramp-up (RPS). "
-        "Needs to be specified when --ramp-up-strategy is used.",
-    )
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
 
-    return parser
+Please use the following command instead:
+    vllm bench serve
 
+For help with the new command, run:
+    vllm bench serve --help
 
-if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    main(args)
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench serve --help
+""")
+    sys.exit(1)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index ca6843a72aa36f56c66c5d83950fd03d012d2911..4aae755eb4e444cc858511e144d73c4816766aeb 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -998,7 +998,7 @@ def create_argument_parser():
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'Default value is "ttft,tpot,itl".',
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6b24b8c8f3c6768d27e49e2d1810754b50e9c665..b6dc0918fd4d1a3001241e84048344568aa15e16 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -1,741 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
+import sys
 
-import argparse
-import dataclasses
-import json
-import os
-import random
-import time
-import warnings
-from typing import Any, Optional, Union
-
-import torch
-import uvloop
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
-from typing_extensions import deprecated
-
-from benchmark_dataset import (
-    AIMODataset,
-    BurstGPTDataset,
-    ConversationDataset,
-    InstructCoderDataset,
-    RandomDataset,
-    SampleRequest,
-    ShareGPTDataset,
-    SonnetDataset,
-    VisionArenaDataset,
-)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args,
-)
-from vllm.inputs import TextPrompt, TokensPrompt
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-
-
-def run_vllm(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: EngineArgs,
-    disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
-    from vllm import LLM, SamplingParams
-
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert all(
-        llm.llm_engine.model_config.max_model_len
-        >= (request.prompt_len + request.expected_output_len)
-        for request in requests
-    ), (
-        "Please ensure that max_model_len is greater than the sum of"
-        " prompt_len and expected_output_len for all requests."
-    )
-    # Add the requests to the engine.
-    prompts: list[Union[TextPrompt, TokensPrompt]] = []
-    sampling_params: list[SamplingParams] = []
-    for request in requests:
-        prompts.append(
-            TokensPrompt(
-                prompt_token_ids=request.prompt["prompt_token_ids"],
-                multi_modal_data=request.multi_modal_data,
-            )
-            if "prompt_token_ids" in request.prompt
-            else TextPrompt(
-                prompt=request.prompt, multi_modal_data=request.multi_modal_data
-            )
-        )
-        sampling_params.append(
-            SamplingParams(
-                n=n,
-                temperature=1.0,
-                top_p=1.0,
-                ignore_eos=True,
-                max_tokens=request.expected_output_len,
-                detokenize=not disable_detokenize,
-            )
-        )
-    lora_requests: Optional[list[LoRARequest]] = None
-    if engine_args.enable_lora:
-        lora_requests = [request.lora_request for request in requests]
-
-    use_beam_search = False
-
-    outputs = None
-    if not use_beam_search:
-        start = time.perf_counter()
-        outputs = llm.generate(
-            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
-        )
-        end = time.perf_counter()
-    else:
-        assert lora_requests is None, "BeamSearch API does not support LoRA"
-        # output_len should be the same for all requests.
-        output_len = requests[0].expected_output_len
-        for request in requests:
-            assert request.expected_output_len == output_len
-        start = time.perf_counter()
-        llm.beam_search(
-            prompts,
-            BeamSearchParams(
-                beam_width=n,
-                max_tokens=output_len,
-                ignore_eos=True,
-            ),
-        )
-        end = time.perf_counter()
-    return end - start, outputs
-
-
-def run_vllm_chat(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: EngineArgs,
-    disable_detokenize: bool = False,
-) -> tuple[float, list[RequestOutput]]:
-    """
-    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
-    multimodal models as it properly handles multimodal inputs and chat
-    formatting. For non-multimodal models, use run_vllm() instead.
-    """
-    from vllm import LLM, SamplingParams
-
-    llm = LLM(**dataclasses.asdict(engine_args))
-
-    assert all(
-        llm.llm_engine.model_config.max_model_len
-        >= (request.prompt_len + request.expected_output_len)
-        for request in requests
-    ), (
-        "Please ensure that max_model_len is greater than the sum of "
-        "prompt_len and expected_output_len for all requests."
-    )
-
-    prompts = []
-    sampling_params: list[SamplingParams] = []
-    for request in requests:
-        prompts.append(request.prompt)
-        sampling_params.append(
-            SamplingParams(
-                n=n,
-                temperature=1.0,
-                top_p=1.0,
-                ignore_eos=True,
-                max_tokens=request.expected_output_len,
-                detokenize=not disable_detokenize,
-            )
-        )
-    start = time.perf_counter()
-    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
-    return end - start, outputs
-
-
-async def run_vllm_async(
-    requests: list[SampleRequest],
-    n: int,
-    engine_args: AsyncEngineArgs,
-    disable_frontend_multiprocessing: bool = False,
-    disable_detokenize: bool = False,
-) -> float:
-    from vllm import SamplingParams
-
-    async with build_async_engine_client_from_engine_args(
-        engine_args,
-        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
-    ) as llm:
-        model_config = await llm.get_model_config()
-        assert all(
-            model_config.max_model_len
-            >= (request.prompt_len + request.expected_output_len)
-            for request in requests
-        ), (
-            "Please ensure that max_model_len is greater than the sum of"
-            " prompt_len and expected_output_len for all requests."
-        )
-
-        # Add the requests to the engine.
-        prompts: list[Union[TextPrompt, TokensPrompt]] = []
-        sampling_params: list[SamplingParams] = []
-        lora_requests: list[Optional[LoRARequest]] = []
-        for request in requests:
-            prompts.append(
-                TokensPrompt(
-                    prompt_token_ids=request.prompt["prompt_token_ids"],
-                    multi_modal_data=request.multi_modal_data,
-                )
-                if "prompt_token_ids" in request.prompt
-                else TextPrompt(
-                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
-                )
-            )
-            sampling_params.append(
-                SamplingParams(
-                    n=n,
-                    temperature=1.0,
-                    top_p=1.0,
-                    ignore_eos=True,
-                    max_tokens=request.expected_output_len,
-                    detokenize=not disable_detokenize,
-                )
-            )
-            lora_requests.append(request.lora_request)
-
-        generators = []
-        start = time.perf_counter()
-        for i, (prompt, sp, lr) in enumerate(
-            zip(prompts, sampling_params, lora_requests)
-        ):
-            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
-            generators.append(generator)
-        all_gens = merge_async_iterators(*generators)
-        async for i, res in all_gens:
-            pass
-        end = time.perf_counter()
-        return end - start
-
-
-def run_hf(
-    requests: list[SampleRequest],
-    model: str,
-    tokenizer: PreTrainedTokenizerBase,
-    n: int,
-    max_batch_size: int,
-    trust_remote_code: bool,
-    disable_detokenize: bool = False,
-) -> float:
-    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
-    )
-    if llm.config.model_type == "llama":
-        # To enable padding in the HF backend.
-        tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.cuda()
-
-    pbar = tqdm(total=len(requests))
-    start = time.perf_counter()
-    batch: list[str] = []
-    max_prompt_len = 0
-    max_output_len = 0
-    for i in range(len(requests)):
-        prompt = requests[i].prompt
-        prompt_len = requests[i].prompt_len
-        output_len = requests[i].expected_output_len
-        # Add the prompt to the batch.
-        batch.append(prompt)
-        max_prompt_len = max(max_prompt_len, prompt_len)
-        max_output_len = max(max_output_len, output_len)
-        if len(batch) < max_batch_size and i != len(requests) - 1:
-            # Check if we can add more requests to the batch.
-            next_prompt_len = requests[i + 1].prompt_len
-            next_output_len = requests[i + 1].expected_output_len
-            if (
-                max(max_prompt_len, next_prompt_len)
-                + max(max_output_len, next_output_len)
-            ) <= 2048:
-                # We can add more requests to the batch.
-                continue
-
-        # Generate the sequences.
-        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
-        llm_outputs = llm.generate(
-            input_ids=input_ids.cuda(),
-            do_sample=True,
-            num_return_sequences=n,
-            temperature=1.0,
-            top_p=1.0,
-            use_cache=True,
-            max_new_tokens=max_output_len,
-        )
-        if not disable_detokenize:
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
-        pbar.update(len(batch))
-
-        # Clear the batch.
-        batch = []
-        max_prompt_len = 0
-        max_output_len = 0
-    end = time.perf_counter()
-    return end - start
-
-
-def run_mii(
-    requests: list[SampleRequest],
-    model: str,
-    tensor_parallel_size: int,
-    output_len: int,
-) -> float:
-    from mii import client, serve
-
-    llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [request.prompt for request in requests]
-
-    start = time.perf_counter()
-    llm.generate(prompts, max_new_tokens=output_len)
-    end = time.perf_counter()
-    client = client(model)
-    client.terminate_server()
-    return end - start
-
-
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={
-            "requests_per_second": [results["requests_per_second"]],
-            "tokens_per_second": [results["tokens_per_second"]],
-        },
-        extra_info={
-            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
-        },
-    )
-    if pt_records:
-        # Don't use json suffix here as we don't want CI to pick it up
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-
-
-def get_requests(args, tokenizer):
-    # Common parameters for all dataset types.
-    common_kwargs = {
-        "dataset_path": args.dataset_path,
-        "random_seed": args.seed,
-    }
-    sample_kwargs = {
-        "tokenizer": tokenizer,
-        "lora_path": args.lora_path,
-        "max_loras": args.max_loras,
-        "num_requests": args.num_prompts,
-        "input_len": args.input_len,
-        "output_len": args.output_len,
-    }
-
-    if args.dataset_path is None or args.dataset_name == "random":
-        sample_kwargs["range_ratio"] = args.random_range_ratio
-        sample_kwargs["prefix_len"] = args.prefix_len
-        dataset_cls = RandomDataset
-    elif args.dataset_name == "sharegpt":
-        dataset_cls = ShareGPTDataset
-        if args.backend == "vllm-chat":
-            sample_kwargs["enable_multimodal_chat"] = True
-    elif args.dataset_name == "sonnet":
-        assert tokenizer.chat_template or tokenizer.default_chat_template, (
-            "Tokenizer/model must have chat template for sonnet dataset."
-        )
-        dataset_cls = SonnetDataset
-        sample_kwargs["prefix_len"] = args.prefix_len
-        sample_kwargs["return_prompt_formatted"] = True
-    elif args.dataset_name == "burstgpt":
-        dataset_cls = BurstGPTDataset
-    elif args.dataset_name == "hf":
-        common_kwargs["no_stream"] = args.no_stream
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = VisionArenaDataset
-            common_kwargs["dataset_subset"] = None
-            common_kwargs["dataset_split"] = "train"
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = InstructCoderDataset
-            common_kwargs["dataset_split"] = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = ConversationDataset
-            common_kwargs["dataset_subset"] = args.hf_subset
-            common_kwargs["dataset_split"] = args.hf_split
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = AIMODataset
-            common_kwargs["dataset_subset"] = None
-            common_kwargs["dataset_split"] = "train"
-    else:
-        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
-    # Remove None values
-    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
-    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
-
-
-@deprecated(
-    "benchmark_throughput.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench throughput' instead.",
-)
-def main(args: argparse.Namespace):
-    if args.seed is None:
-        args.seed = 0
-    print(args)
-    random.seed(args.seed)
-    # Sample the requests.
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code
-    )
-    requests = get_requests(args, tokenizer)
-    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
-    request_outputs: Optional[list[RequestOutput]] = None
-    if args.backend == "vllm":
-        if args.async_engine:
-            elapsed_time = uvloop.run(
-                run_vllm_async(
-                    requests,
-                    args.n,
-                    AsyncEngineArgs.from_cli_args(args),
-                    args.disable_frontend_multiprocessing,
-                    args.disable_detokenize,
-                )
-            )
-        else:
-            elapsed_time, request_outputs = run_vllm(
-                requests,
-                args.n,
-                EngineArgs.from_cli_args(args),
-                args.disable_detokenize,
-            )
-    elif args.backend == "hf":
-        assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(
-            requests,
-            args.model,
-            tokenizer,
-            args.n,
-            args.hf_max_batch_size,
-            args.trust_remote_code,
-            args.disable_detokenize,
-        )
-    elif args.backend == "mii":
-        elapsed_time = run_mii(
-            requests, args.model, args.tensor_parallel_size, args.output_len
-        )
-    elif args.backend == "vllm-chat":
-        elapsed_time, request_outputs = run_vllm_chat(
-            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
-        )
-    else:
-        raise ValueError(f"Unknown backend: {args.backend}")
-
-    if request_outputs:
-        # Note: with the vllm and vllm-chat backends,
-        # we have request_outputs, which we use to count tokens.
-        total_prompt_tokens = 0
-        total_output_tokens = 0
-        for ro in request_outputs:
-            if not isinstance(ro, RequestOutput):
-                continue
-            total_prompt_tokens += (
-                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
-            )
-            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
-        total_num_tokens = total_prompt_tokens + total_output_tokens
-    else:
-        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
-        total_output_tokens = sum(r.expected_output_len for r in requests)
-        total_prompt_tokens = total_num_tokens - total_output_tokens
-
-    if is_multi_modal and args.backend != "vllm-chat":
-        print(
-            "\033[91mWARNING\033[0m: Multi-modal request with "
-            f"{args.backend} backend detected. The "
-            "following metrics are not accurate because image tokens are not"
-            " counted. See vllm-project/vllm/issues/9778 for details."
-        )
-        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
-        # vllm-chat backend counts the image tokens now
-
-    print(
-        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
-        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
-        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
-    )
-    print(f"Total num prompt tokens:  {total_prompt_tokens}")
-    print(f"Total num output tokens:  {total_output_tokens}")
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "elapsed_time": elapsed_time,
-            "num_requests": len(requests),
-            "total_num_tokens": total_num_tokens,
-            "requests_per_second": len(requests) / elapsed_time,
-            "tokens_per_second": total_num_tokens / elapsed_time,
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-
-
-def validate_args(args):
-    """
-    Validate command-line arguments.
-    """
-
-    # === Deprecation and Defaulting ===
-    if args.dataset is not None:
-        warnings.warn(
-            "The '--dataset' argument will be deprecated in the next release. "
-            "Please use '--dataset-name' and '--dataset-path' instead.",
-            stacklevel=2,
-        )
-        args.dataset_path = args.dataset
-
-    if not getattr(args, "tokenizer", None):
-        args.tokenizer = args.model
-
-    # === Backend Validation ===
-    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
-    if args.backend not in valid_backends:
-        raise ValueError(f"Unsupported backend: {args.backend}")
-
-    # === Dataset Configuration ===
-    if not args.dataset and not args.dataset_path:
-        print("When dataset path is not set, it will default to random dataset")
-        args.dataset_name = "random"
-        if args.input_len is None:
-            raise ValueError("input_len must be provided for a random dataset")
-
-    # === Dataset Name Specific Checks ===
-    # --hf-subset and --hf-split: only used
-    # when dataset_name is 'hf'
-    if args.dataset_name != "hf" and (
-        getattr(args, "hf_subset", None) is not None
-        or getattr(args, "hf_split", None) is not None
-    ):
-        warnings.warn(
-            "--hf-subset and --hf-split will be ignored \
-                since --dataset-name is not 'hf'.",
-            stacklevel=2,
-        )
-    elif args.dataset_name == "hf":
-        if args.dataset_path in (
-            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-            | ConversationDataset.SUPPORTED_DATASET_PATHS
-        ):
-            assert args.backend == "vllm-chat", (
-                f"{args.dataset_path} needs to use vllm-chat as the backend."
-            )  # noqa: E501
-        elif args.dataset_path in (
-            InstructCoderDataset.SUPPORTED_DATASET_PATHS
-            | AIMODataset.SUPPORTED_DATASET_PATHS
-        ):
-            assert args.backend == "vllm", (
-                f"{args.dataset_path} needs to use vllm as the backend."
-            )  # noqa: E501
-        else:
-            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
-
-    # --random-range-ratio: only used when dataset_name is 'random'
-    if args.dataset_name != "random" and args.random_range_ratio is not None:
-        warnings.warn(
-            "--random-range-ratio will be ignored since \
-                --dataset-name is not 'random'.",
-            stacklevel=2,
-        )
-
-    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
-    # set.
-    if (
-        args.dataset_name not in {"random", "sonnet", None}
-        and args.prefix_len is not None
-    ):
-        warnings.warn(
-            "--prefix-len will be ignored since --dataset-name\
-                 is not 'random', 'sonnet', or not set.",
-            stacklevel=2,
-        )
-
-    # === LoRA Settings ===
-    if getattr(args, "enable_lora", False) and args.backend != "vllm":
-        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
-    if getattr(args, "enable_lora", False) and args.lora_path is None:
-        raise ValueError("LoRA path must be provided when enable_lora is True")
-
-    # === Backend-specific Validations ===
-    if args.backend == "hf" and args.hf_max_batch_size is None:
-        raise ValueError("HF max batch size is required for HF backend")
-    if args.backend != "hf" and args.hf_max_batch_size is not None:
-        raise ValueError("HF max batch size is only for HF backend.")
-
-    if (
-        args.backend in {"hf", "mii"}
-        and getattr(args, "quantization", None) is not None
-    ):
-        raise ValueError("Quantization is only for vLLM backend.")
-
-    if args.backend == "mii" and args.dtype != "auto":
-        raise ValueError("dtype must be auto for MII backend.")
-    if args.backend == "mii" and args.n != 1:
-        raise ValueError("n must be 1 for MII backend.")
-    if args.backend == "mii" and args.tokenizer != args.model:
-        raise ValueError("Tokenizer must be the same as the model for MII backend.")
-
-    # --data-parallel is not supported currently.
-    # https://github.com/vllm-project/vllm/issues/16222
-    if args.data_parallel_size > 1:
-        raise ValueError(
-            "Data parallel is not supported in offline benchmark, "
-            "please use benchmark serving instead"
-        )
-
-
-def create_argument_parser():
-    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument(
-        "--backend",
-        type=str,
-        choices=["vllm", "hf", "mii", "vllm-chat"],
-        default="vllm",
-    )
-    parser.add_argument(
-        "--dataset-name",
-        type=str,
-        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
-        help="Name of the dataset to benchmark on.",
-        default="sharegpt",
-    )
-    parser.add_argument(
-        "--no-stream",
-        action="store_true",
-        help="Do not load the dataset in streaming mode.",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        help="Path to the ShareGPT dataset, will be deprecated in\
-            the next release. The dataset is expected to "
-        "be a json in form of list[dict[..., conversations: "
-        "list[dict[..., value: <prompt_or_response>]]]]",
-    )
-    parser.add_argument(
-        "--dataset-path", type=str, default=None, help="Path to the dataset"
-    )
-    parser.add_argument(
-        "--input-len",
-        type=int,
-        default=None,
-        help="Input prompt length for each request",
-    )
-    parser.add_argument(
-        "--output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the "
-        "output length from the dataset.",
-    )
-    parser.add_argument(
-        "--n", type=int, default=1, help="Number of generated sequences per prompt."
-    )
-    parser.add_argument(
-        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
-    )
-    parser.add_argument(
-        "--hf-max-batch-size",
-        type=int,
-        default=None,
-        help="Maximum batch size for HF backend.",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the throughput results in JSON format.",
-    )
-    parser.add_argument(
-        "--async-engine",
-        action="store_true",
-        default=False,
-        help="Use vLLM async engine rather than LLM class.",
-    )
-    parser.add_argument(
-        "--disable-frontend-multiprocessing",
-        action="store_true",
-        default=False,
-        help="Disable decoupled async engine frontend.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize the response (i.e. do not include "
-            "detokenization time in the measurement)"
-        ),
-    )
-    # LoRA
-    parser.add_argument(
-        "--lora-path",
-        type=str,
-        default=None,
-        help="Path to the LoRA adapters to use. This can be an absolute path, "
-        "a relative path, or a Hugging Face model identifier.",
-    )
-    parser.add_argument(
-        "--prefix-len",
-        type=int,
-        default=None,
-        help=f"Number of prefix tokens to be used in RandomDataset "
-        "and SonnetDataset. For RandomDataset, the total input "
-        "length is the sum of prefix-len (default: "
-        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
-        "sampled from [input_len * (1 - range_ratio), "
-        "input_len * (1 + range_ratio)]. For SonnetDataset, "
-        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
-        "controls how much of the input is fixed lines versus "
-        "random lines, but the total input length remains approximately "
-        "input_len tokens.",
-    )
-    # random dataset
-    parser.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=None,
-        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
-        "for sampling input/output length, "
-        "used only for RandomDataset. Must be in the range [0, 1) to "
-        "define a symmetric sampling range "
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
-    )
-
-    # hf dtaset
-    parser.add_argument(
-        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-    )
-    parser.add_argument(
-        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-    )
-
-    parser = AsyncEngineArgs.add_cli_args(parser)
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
 
-    return parser
+Please use the following command instead:
+    vllm bench throughput
 
+For help with the new command, run:
+    vllm bench throughput --help
 
-if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.tokenizer is None:
-        args.tokenizer = args.model
-    validate_args(args)
-    main(args)
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+    sys.exit(1)
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
index 92f97ffabea2ad18aaa3d9a63ba4fa86f7b26ea6..2c72941cf7e5112724683a89585276e1bae6e9c0 100644
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
 
   CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
   wait_for_server 8100
   wait_for_server 8200
diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
index af2bcba3ea57ae6c22355e0df726a54976d487f2..0bbf7cd2b1c81e5d3d6641b705918a11a3feead6 100644
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
   CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@ launch_disagg_prefill() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
   wait_for_server 8100
   wait_for_server 8200
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
index 9663503e9baa0907aec3e73ba1f1db85107a2ed0..f1e504499eaf612de382709911f966b0de782f84 100644
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -4,7 +4,10 @@
 import torch
 
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
@@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
 ]
 
 
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
     """Build runner function for w8a8 block fp8 matmul."""
     factor_for_scale = 1e-2
 
@@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
     fp8_max, fp8_min = fp8_info.max, fp8_info.min
 
     # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
 
-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
+    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
 
     # Create scales
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k
 
-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
     Bs = (
         torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
         * factor_for_scale
     )
 
+    # SM90 CUTLASS requires row-major format for scales
+    if use_cutlass and current_platform.is_device_capability(90):
+        Bs = Bs.T.contiguous()
+
     def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+        if use_cutlass:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+            )
+        else:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+            )
 
     return run
 
 
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+    available_providers.append("w8a8-block-fp8-cutlass")
+
+
 @vllm_triton.testing.perf_report(
     vllm_triton.testing.Benchmark(
         x_names=["batch_size"],
         x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
         x_log=False,
         line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
-        line_names=["torch-bf16", "w8a8-block-fp8"],
+        line_vals=available_providers,
+        line_names=available_providers,
         ylabel="TFLOP/s (larger is better)",
         plot_name="BF16 vs W8A8 Block FP8 GEMMs",
         args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
         ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
             lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
         )
-    else:  # w8a8-block-fp8
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+    elif provider == "w8a8-block-fp8-triton":
+        run_w8a8_triton = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=False
+        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_triton(), quantiles=quantiles
+        )
+    elif provider == "w8a8-block-fp8-cutlass":
+        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=True
+        )
         ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
+            lambda: run_w8a8_cutlass(), quantiles=quantiles
         )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
 
     to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
     return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..93edbcc9391fcf696e7333d7945919c8248cfb75
--- /dev/null
+++ b/benchmarks/kernels/benchmark_activation.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# benchmark custom activation op performance
+import itertools
+
+import torch
+
+import vllm.model_executor.layers.activation  # noqa F401
+from vllm.model_executor.custom_op import CustomOp
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+intermediate_size = [3072, 9728, 12288]
+configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
+
+
+def benchmark_activation(
+    batch_size: int,
+    seq_len: int,
+    intermediate_size: int,
+    provider: str,
+    func_name: str,
+    dtype: torch.dtype,
+):
+    device = "cuda"
+    num_tokens = batch_size * seq_len
+    dim = intermediate_size
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    if func_name == "gelu_and_mul":
+        layer = CustomOp.op_registry[func_name](approximate="none")
+    elif func_name == "gelu_and_mul_tanh":
+        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
+    elif func_name == "fatrelu_and_mul":
+        threshold = 0.5
+        layer = CustomOp.op_registry[func_name](threshold)
+    else:
+        layer = CustomOp.op_registry[func_name]()
+
+    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
+    compiled_layer = torch.compile(layer.forward_native)
+
+    if provider == "custom":
+        fn = lambda: layer(x)
+    elif provider == "compiled":
+        fn = lambda: compiled_layer(x)
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+        fn, quantiles=[0.5, 0.2, 0.8]
+    )
+    return ms, max_ms, min_ms
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
+    parser.add_argument(
+        "--func-name",
+        type=str,
+        choices=[
+            "mul_and_silu",
+            "silu_and_mul",
+            "gelu_and_mul",
+            "gelu_and_mul_tanh",
+            "fatrelu_and_mul",
+            "swigluoai_and_mul",
+            "gelu_new",
+            "gelu_fast",
+            "quick_gelu",
+        ],
+        default="silu_and_mul",
+    )
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    args = parser.parse_args()
+    assert args
+
+    func_name = args.func_name
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    perf_report = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len", "intermediate_size"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["custom", "compiled"],
+            line_names=["Custom OP", "Compiled"],
+            styles=[("blue", "-"), ("green", "-")],
+            ylabel="ms",
+            plot_name=f"{func_name}-op-performance",
+            args={},
+        )
+    )
+
+    perf_report(
+        lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
+            batch_size, seq_len, intermediate_size, provider, func_name, dtype
+        )
+    ).run(print_data=True)
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
new file mode 100644
index 0000000000000000000000000000000000000000..a61c17edc1e28e0211edf5b865726723981bbfb8
--- /dev/null
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Benchmark script for device communicators:
+CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
+and SymmMemCommunicator (multimem, two-shot).
+
+Usage:
+    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
+
+Example:
+    torchrun --nproc_per_node=2 benchmark_device_communicators.py
+    --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
+"""
+
+import json
+import os
+import time
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
+from vllm.logger import init_logger
+from vllm.utils import FlexibleArgumentParser
+
+logger = init_logger(__name__)
+
+# Default sequence lengths to benchmark
+DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
+
+# Fixed hidden size and dtype for all benchmarks
+HIDDEN_SIZE = 8192
+BENCHMARK_DTYPE = torch.bfloat16
+
+# CUDA graph settings
+CUDA_GRAPH_CAPTURE_CYCLES = 10
+
+
+class CommunicatorBenchmark:
+    """Benchmark class for testing device communicators."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        device: torch.device,
+        cpu_group: ProcessGroup,
+        sequence_lengths: list[int],
+    ):
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.cpu_group = cpu_group
+
+        # Calculate max_size_override based on largest sequence length
+        max_seq_len = max(sequence_lengths)
+        max_tensor_elements = max_seq_len * HIDDEN_SIZE
+        self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
+
+        # Initialize communicators
+        self.custom_allreduce = None
+        self.pynccl_comm = None
+        self.symm_mem_comm = None
+        self.symm_mem_comm_multimem = None
+        self.symm_mem_comm_two_shot = None
+
+        self._init_communicators()
+
+    def _init_communicators(self):
+        """Initialize all available communicators."""
+        try:
+            self.custom_allreduce = CustomAllreduce(
+                group=self.cpu_group,
+                device=self.device,
+                max_size=self.max_size_override,
+            )
+            if not self.custom_allreduce.disabled:
+                logger.info("Rank %s: CustomAllreduce initialized", self.rank)
+            else:
+                logger.info("Rank %s: CustomAllreduce disabled", self.rank)
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
+            )
+            self.custom_allreduce = None
+
+        try:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group, device=self.device
+            )
+            if not self.pynccl_comm.disabled:
+                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+            else:
+                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
+                self.pynccl_comm = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
+            )
+            self.pynccl_comm = None
+
+        # Initialize variants for SymmMemCommunicator
+        try:
+            self.symm_mem_comm_multimem = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=True,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_multimem.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_multimem = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_multimem = None
+
+        try:
+            self.symm_mem_comm_two_shot = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                force_multimem=False,
+                max_size_override=self.max_size_override,
+            )
+            if not self.symm_mem_comm_two_shot.disabled:
+                logger.info(
+                    "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
+                )
+            else:
+                self.symm_mem_comm_two_shot = None
+        except Exception as e:
+            logger.warning(
+                "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
+                self.rank,
+                e,
+            )
+            self.symm_mem_comm_two_shot = None
+
+    def benchmark_allreduce(
+        self, sequence_length: int, num_warmup: int, num_trials: int
+    ) -> dict[str, float]:
+        """Benchmark allreduce operations for all available communicators."""
+
+        results = {}
+
+        # Define communicators with their benchmark functions
+        communicators = []
+
+        if self.custom_allreduce is not None:
+            comm = self.custom_allreduce
+            # CustomAllreduce one-shot
+            communicators.append(
+                (
+                    "ca_1stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "1stage",  # env variable value
+                )
+            )
+            # CustomAllreduce two-shot
+            communicators.append(
+                (
+                    "ca_2stage",
+                    lambda t, c=comm: c.custom_all_reduce(t),
+                    lambda t, c=comm: c.should_custom_ar(t),
+                    comm.capture(),
+                    "2stage",  # env variable value
+                )
+            )
+
+        if self.pynccl_comm is not None:
+            comm = self.pynccl_comm
+            communicators.append(
+                (
+                    "pynccl",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t: True,  # Always available if initialized
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_multimem is not None:
+            comm = self.symm_mem_comm_multimem
+            communicators.append(
+                (
+                    "symm_mem_multimem",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        if self.symm_mem_comm_two_shot is not None:
+            comm = self.symm_mem_comm_two_shot
+            communicators.append(
+                (
+                    "symm_mem_two_shot",
+                    lambda t, c=comm: c.all_reduce(t),
+                    lambda t, c=comm: c.should_use_symm_mem(t),
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
+
+        # Benchmark each communicator
+        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
+            # Set environment variable if needed
+            if env_var is not None:
+                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
+            else:
+                # Clear the environment variable to avoid interference
+                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
+
+            latency = self.benchmark_allreduce_single(
+                sequence_length,
+                allreduce_fn,
+                should_use_fn,
+                context,
+                num_warmup,
+                num_trials,
+            )
+            if latency is not None:
+                results[name] = latency
+
+        return results
+
+    def benchmark_allreduce_single(
+        self,
+        sequence_length: int,
+        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        should_use_fn: Callable[[torch.Tensor], bool],
+        context,
+        num_warmup: int,
+        num_trials: int,
+    ) -> Optional[float]:
+        """Benchmark method with CUDA graph optimization."""
+        try:
+            # Create test tensor (2D: sequence_length x hidden_size)
+            tensor = torch.randn(
+                sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
+            )
+            if not should_use_fn(tensor):
+                return None
+
+            torch.cuda.synchronize()
+            stream = torch.cuda.Stream()
+            with torch.cuda.stream(stream):
+                graph_input = tensor.clone()
+
+                # Warmup before capture
+                for _ in range(3):
+                    allreduce_fn(graph_input)
+
+                # Capture the graph using context manager
+                with context:
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph):
+                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
+                            allreduce_fn(graph_input)
+
+            torch.cuda.synchronize()
+            for _ in range(num_warmup):
+                graph.replay()
+            torch.cuda.synchronize()
+
+            torch.cuda.synchronize()
+            start_time = time.perf_counter()
+
+            for _ in range(num_trials):
+                graph.replay()
+            torch.cuda.synchronize()
+
+            end_time = time.perf_counter()
+
+            # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
+            return (
+                (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
+            )
+
+        except Exception as e:
+            logger.error("CUDA graph benchmark failed: %s", e)
+            raise RuntimeError(
+                f"CUDA graph benchmark failed for communicator: {e}"
+            ) from e
+
+
+def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
+    """Calculate speedup information for a single tensor size."""
+    if not comm_results:
+        return "N/A"
+
+    # Find the fastest communicator
+    fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
+    fastest_time = comm_results[fastest_comm]
+
+    # Calculate speedup vs PyNccl if available
+    if "pynccl" in comm_results:
+        pynccl_time = comm_results["pynccl"]
+        speedup = pynccl_time / fastest_time
+        return f"{fastest_comm} ({speedup:.2f}x)"
+    else:
+        return f"{fastest_comm} (N/A)"
+
+
+def print_results(
+    results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
+):
+    """Print benchmark results in a formatted table."""
+
+    print(f"\n{'=' * 130}")
+    print("Device Communicator Benchmark Results")
+    print(
+        f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
+        f"Hidden Size: {HIDDEN_SIZE}"
+    )
+    print(f"{'=' * 130}")
+
+    # Get all communicator names
+    all_comms = set()
+    for size_results in results.values():
+        all_comms.update(size_results.keys())
+
+    all_comms = sorted(list(all_comms))
+
+    # Print header
+    header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
+    for comm in all_comms:
+        header += f"{comm:<20}"
+    header += f"{'Best (Speedup vs PyNccl)':<30}"
+    print(header)
+    print("-" * len(header))
+
+    # Print results for each sequence length
+    for seq_len in sequence_lengths:
+        if seq_len in results:
+            # Calculate tensor size in elements and bytes
+            tensor_elements = seq_len * HIDDEN_SIZE
+            tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
+
+            # Format tensor size (MB)
+            tensor_size_mb = tensor_bytes / (1024 * 1024)
+            tensor_size_str = f"{tensor_size_mb:.2f} MB"
+
+            # Format tensor shape
+            tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
+
+            row = f"{tensor_shape:<20}{tensor_size_str:<15}"
+            for comm in all_comms:
+                if comm in results[seq_len]:
+                    row += f"{results[seq_len][comm]:<20.3f}"
+                else:
+                    row += f"{'N/A':<20}"
+
+            # Calculate speedup information
+            speedup_info = _calculate_speedup_info(results[seq_len])
+            row += f"{speedup_info:<30}"
+
+            print(row)
+
+    print(f"{'=' * 130}")
+    print("All times are in milliseconds (ms) per allreduce operation")
+    print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
+
+
+def main():
+    parser = FlexibleArgumentParser(description="Benchmark device communicators")
+
+    parser.add_argument(
+        "--sequence-lengths",
+        type=int,
+        nargs="+",
+        default=DEFAULT_SEQUENCE_LENGTHS,
+        help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
+    )
+
+    parser.add_argument(
+        "--num-warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+
+    parser.add_argument(
+        "--num-trials", type=int, default=50, help="Number of benchmark trials"
+    )
+
+    parser.add_argument("--output-json", type=str, help="Output results to JSON file")
+
+    args = parser.parse_args()
+
+    # Initialize distributed
+    if not dist.is_initialized():
+        dist.init_process_group(backend="gloo")
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    # Set device
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+
+    # Get CPU process group
+    cpu_group = dist.new_group(backend="gloo")
+
+    # Disable USE_SYMM_MEM to avoid affecting the max_sizes
+    # in symm_mem and custom_all_reduce for benchmark
+    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
+
+    # Initialize benchmark
+    benchmark = CommunicatorBenchmark(
+        rank, world_size, device, cpu_group, args.sequence_lengths
+    )
+
+    # Run benchmarks
+    all_results = {}
+
+    for seq_len in args.sequence_lengths:
+        if rank == 0:
+            logger.info(
+                "Benchmarking sequence length: %s (tensor shape: %s x %s)",
+                seq_len,
+                seq_len,
+                HIDDEN_SIZE,
+            )
+
+        results = benchmark.benchmark_allreduce(
+            sequence_length=seq_len,
+            num_warmup=args.num_warmup,
+            num_trials=args.num_trials,
+        )
+
+        all_results[seq_len] = results
+
+        # Synchronize between ranks
+        dist.barrier()
+
+    # Print results (only rank 0)
+    if rank == 0:
+        print_results(all_results, args.sequence_lengths, world_size)
+
+        # Save to JSON if requested
+        if args.output_json:
+            # Add speedup information to results
+            enhanced_results = {}
+            for seq_len, comm_results in all_results.items():
+                enhanced_results[seq_len] = {
+                    "timings": comm_results,
+                    "speedup_info": _calculate_speedup_info(comm_results),
+                }
+
+            output_data = {
+                "world_size": world_size,
+                "dtype": str(BENCHMARK_DTYPE),
+                "hidden_size": HIDDEN_SIZE,
+                "sequence_lengths": args.sequence_lengths,
+                "num_warmup": args.num_warmup,
+                "num_trials": args.num_trials,
+                "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
+                "results": enhanced_results,
+            }
+
+            with open(args.output_json, "w") as f:
+                json.dump(output_data, f, indent=2)
+
+            logger.info("Results saved to %s", args.output_json)
+
+    # Cleanup
+    if cpu_group != dist.group.WORLD:
+        dist.destroy_process_group(cpu_group)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 3d38d4b3534e8993abc8aff85712230e7262a740..89309c79f09911c37302476454067f5057165d56 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -637,7 +637,7 @@ def bench_optype(
     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
     torch.cuda.synchronize()
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 710d30adfd84605bfca1b664324956dab8cde343..94f3f1ae11f27be7ba66c28f844bd8b33652e830 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -594,7 +594,11 @@ def main(args: argparse.Namespace):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+    elif config.architectures[0] in (
+        "Qwen2MoeForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Qwen3NextForCausalLM",
+    ):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
@@ -678,7 +682,11 @@ def main(args: argparse.Namespace):
         is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
         search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
         print(f"Start tuning over {len(search_space)} configurations...")
-
+        if use_deep_gemm:
+            raise ValueError(
+                "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+                "kernels. Please remove the flag."
+            )
         start = time.time()
         configs = _distribute(
             "tune",
diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ac8f5e6594e4562981576c41a693d8825f90e77
--- /dev/null
+++ b/benchmarks/kernels/benchmark_polynorm.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import itertools
+
+import torch
+
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+
+
+def polynorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+
+    def norm(x, eps: float):
+        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+    x = x.float()
+    return (
+        (
+            weight[0] * norm(x**3, eps)
+            + weight[1] * norm(x**2, eps)
+            + weight[2] * norm(x, eps)
+            + bias
+        )
+        .to(weight.dtype)
+        .view(orig_shape)
+    )
+
+
+def polynorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+
+    out = torch.empty_like(x)
+    vllm_ops.poly_norm(out, x, weight, bias, eps)
+    output = out
+
+    output = output.view(orig_shape)
+    return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_dim):
+    dtype = torch.bfloat16
+    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+    weight = torch.ones(3, dtype=dtype, device="cuda")
+    bias = torch.ones(1, dtype=dtype, device="cuda")
+
+    output_naive = polynorm_naive(x, weight, bias)
+    output_vllm = polynorm_vllm(x, weight, bias)
+
+    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark():
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["dim", "batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["naive", "vllm"],
+            line_names=["Naive", "vLLM"],
+            styles=[("blue", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name="polynorm-perf",
+            args={},
+        )
+    )
+    def benchmark(dim, batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        hidden_dim = dim * 4
+
+        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
+        weight = torch.ones(3, dtype=dtype, device="cuda")
+        bias = torch.ones(1, dtype=dtype, device="cuda")
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "naive":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: polynorm_naive(x, weight, bias),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: polynorm_vllm(x, weight, bias),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=4,
+        help="Batch size",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=128,
+        help="Sequence length",
+    )
+    parser.add_argument(
+        "--hidden-dim",
+        type=int,
+        default=8192,
+        help="Intermediate size of MLP",
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/polnorm/",
+        help="Path to save polnorm benchmark results",
+    )
+
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        hidden_dim=args.hidden_dim,
+    )
+
+    benchmark = get_benchmark()
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 603ce5ecf0d2c60ca4de585605fbe89f9529f61f..6ddab46214577d010498cf24f3774f27f7368475 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -259,6 +259,7 @@ if __name__ == "__main__":
         # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
         (None, None, None),
         (None, FP8_DTYPE, None),
+        (FP8_DTYPE, FP8_DTYPE, None),
         (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
         (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
     ]
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 40903c6c3444f5aa3aa88e2247ad3c9c643c2cbb..131df74c7de1b4c196b75e074cdd3480b978db2f 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -274,6 +274,7 @@ if __name__ == "__main__":
     quant_dtypes = [
         # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
         (None, None, None),
+        (FP8_DTYPE, FP8_DTYPE, None),
         (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
         (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
     ]
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index d23b7b6e4571dff6263aeb508ba5c8e4604ed786..66d85eaf513125cc5431dff9256da8a621a2a7c4 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -962,7 +962,7 @@ async def main_mp(
 
     # At this point all the clients finished,
     # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happens before calling join on the clients
+    # This needs to happen before calling join on the clients
     # (result_queue should be emptied).
     while not result_queue.empty():
         client_metrics.append(result_queue.get())
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 52bfd82c7fcfed310bd925ecdbd59ba1f1140f40..06494463223bd41c859656cf36af0cf29b3ed5a8 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -88,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
 
 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     message(STATUS "Apple Silicon Detected")
+    set(APPLE_SILICON_FOUND TRUE)
     set(ENABLE_NUMA OFF)
     check_sysctl(hw.optional.neon ASIMD_FOUND)
     check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -189,7 +190,7 @@ else()
     set(USE_ACL OFF)
 endif()
 
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
     FetchContent_Declare(
         oneDNN
         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 49defccbb1fa4b1b70d0ef0ab2b7019877ea6658..3d32121f13ac2709924cd23e47256e6a1bf820a6 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index 6dd6f269f3dc955558914fe9387c0cbe30d22f32..c60f1823b8a1d42f71beec18ba6b15c074daa76a 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -36,6 +36,7 @@ limitations under the License.
 #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
 void sm100_cutlass_mla_decode(
     torch::Tensor const& out,
+    torch::Tensor const& lse,
     torch::Tensor const& q_nope,
     torch::Tensor const& q_pe,
     torch::Tensor const& kv_c_and_k_pe_cache,
@@ -64,11 +65,11 @@ struct IsPersistent {
   static const bool value = v;
 };
 
-template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
   using Element = T;
   using ElementAcc = float;
-  using ElementOut = T;
+  using ElementOut = TOut;
 
   using TileShape = Shape<_128, _128, Shape<_512, _64>>;
   using TileShapeH = cute::tuple_element_t<0, TileShape>;
@@ -99,6 +100,7 @@ struct MlaSm100 {
 template <typename T>
 typename T::Fmha::Arguments args_from_options(
     at::Tensor const& out,
+    at::Tensor const& lse,
     at::Tensor const& q_nope,
     at::Tensor const& q_pe,
     at::Tensor const& kv_c_and_k_pe_cache,
@@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options(
        stride_PT,
        page_count_total,
        page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
+      {static_cast<ElementOut*>(out.data_ptr()),
+       stride_O,
+       static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
+       stride_LSE},
       hw_info,
       // TODO(trevor-m): Change split_kv back to -1 when
       // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
@@ -178,9 +183,10 @@ typename T::Fmha::Arguments args_from_options(
   return arguments;
 }
 
-template <typename Element, bool IsPaged128, typename PersistenceOption>
+template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
 void runMla(
     at::Tensor const& out,
+    at::Tensor const& lse,
     at::Tensor const& q_nope,
     at::Tensor const& q_pe,
     at::Tensor const& kv_c_and_k_pe_cache,
@@ -190,9 +196,9 @@ void runMla(
     double sm_scale,
     int64_t num_kv_splits,
     cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
+  using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
   typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+  auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
 
   CUTLASS_CHECK(fmha.can_implement(arguments));
 
@@ -214,6 +220,7 @@ void runMla(
 
 void sm100_cutlass_mla_decode(
     torch::Tensor const& out,
+    torch::Tensor const& lse,
     torch::Tensor const& q_nope,
     torch::Tensor const& q_pe,
     torch::Tensor const& kv_c_and_k_pe_cache,
@@ -233,14 +240,14 @@ void sm100_cutlass_mla_decode(
   DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
     DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
       if (in_dtype == at::ScalarType::Half) {
-        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
       } else if (in_dtype == at::ScalarType::BFloat16) {
-        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
       } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+        runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
       } else {
         TORCH_CHECK(false, "Unsupported input data type of MLA");
       }
@@ -253,7 +260,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
   // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
   // which are float, so Element type here doesn't matter.
-  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
+  using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;
 
   // Get split kv. Requires problem shape and sm_count only.
   typename MlaSm100Type::Fmha::Arguments arguments;
diff --git a/csrc/cache.h b/csrc/cache.h
index e8e069aefd9c58e9c315aab1f0151a66182837c9..fd230bec27fca5e43ffa5a273828d37d467a0c4e 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                           const std::string& kv_cache_dtype,
                           torch::Tensor& scale);
 
-void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
-                                   torch::Tensor& cp_local_token_select_indices,
-                                   torch::Tensor& kv_cache,
-                                   torch::Tensor& slot_mapping,
-                                   const std::string& kv_cache_dtype,
-                                   torch::Tensor& scale);
-
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                  const double scale, const std::string& kv_cache_dtype);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index ee1e374ccb14936c2110ae1f13ac95846267eae0..7074f4008847ebeca9039b1ae57c2aeb91c6c5f5 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
   copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
 }
 
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
-__global__ void cp_fused_concat_and_cache_mla_kernel(
-    const scalar_t* __restrict__ kv_c,  // [num_full_tokens, kv_lora_rank]
-    const scalar_t* __restrict__ k_pe,  // [num_full_tokens, pe_dim]
-    const int64_t* __restrict__ cp_local_token_select_indices,  // [num_tokens]
-    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
-                                     // + pe_dim)]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int block_stride,                    //
-    const int entry_stride,                    //
-    const int kv_c_stride,                     //
-    const int k_pe_stride,                     //
-    const int kv_lora_rank,                    //
-    const int pe_dim,                          //
-    const int block_size,                      //
-    const float* scale                         //
-) {
-  const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
-  const int64_t slot_idx = slot_mapping[blockIdx.x];
-  // NOTE: slot_idx can be -1 if the token is padded
-  if (slot_idx < 0) {
-    return;
-  }
-  const int64_t block_idx = slot_idx / block_size;
-  const int64_t block_offset = slot_idx % block_size;
-
-  auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
-                  int src_stride, int dst_stride, int size, int offset) {
-    for (int i = threadIdx.x; i < size; i += blockDim.x) {
-      const int64_t src_idx = token_idx * src_stride + i;
-      const int64_t dst_idx =
-          block_idx * block_stride + block_offset * entry_stride + i + offset;
-      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-        dst[dst_idx] = src[src_idx];
-      } else {
-        dst[dst_idx] =
-            fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
-      }
-    }
-  };
-
-  copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
-  copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
-}
-
 }  // namespace vllm
 
 // KV_T is the data type of key and value tensors.
@@ -554,20 +509,6 @@ void reshape_and_cache_flash(
           kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
           reinterpret_cast<const float*>(scale.data_ptr()));
 
-// KV_T is the data type of key and value tensors.
-// CACHE_T is the stored data type of kv-cache.
-// KV_DTYPE is the real data type of kv-cache.
-#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE)     \
-  vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>   \
-      <<<grid, block, 0, stream>>>(                                     \
-          reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \
-          reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \
-          cp_local_token_select_indices.data_ptr<int64_t>(),            \
-          reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
-          kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \
-          reinterpret_cast<const float*>(scale.data_ptr()));
-
 void concat_and_cache_mla(
     torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank]
     torch::Tensor& k_pe,          // [num_tokens, pe_dim]
@@ -606,50 +547,6 @@ void concat_and_cache_mla(
                              CALL_CONCAT_AND_CACHE_MLA);
 }
 
-// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
-// calls into one:
-// k_c_normed.index_select(0, cp_local_token_select_indices) + \
-// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
-// concat_and_cache_mla.
-void cp_fused_concat_and_cache_mla(
-    torch::Tensor& kv_c,  // [num_total_tokens, kv_lora_rank]
-    torch::Tensor& k_pe,  // [num_total_tokens, pe_dim]
-    torch::Tensor& cp_local_token_select_indices,  // [num_tokens]
-    torch::Tensor& kv_cache,      // [num_blocks, block_size, (kv_lora_rank +
-                                  // pe_dim)]
-    torch::Tensor& slot_mapping,  // [num_tokens] or [num_actual_tokens]
-    const std::string& kv_cache_dtype, torch::Tensor& scale) {
-  // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
-  // slot_mapping.size(0) because of padding for CUDA graphs.
-  // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
-  // both include padding.
-  // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
-  // since key includes padding for CUDA graphs, while slot_mapping does not.
-  // In this case, slot_mapping.size(0) represents the actual number of tokens
-  // before padding.
-  // For compatibility with both cases, we use slot_mapping.size(0) as the
-  // number of tokens.
-  int num_tokens = slot_mapping.size(0);
-  int kv_lora_rank = kv_c.size(1);
-  int pe_dim = k_pe.size(1);
-  int block_size = kv_cache.size(1);
-
-  TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
-
-  int kv_c_stride = kv_c.stride(0);
-  int k_pe_stride = k_pe.stride(0);
-  int block_stride = kv_cache.stride(0);
-  int entry_stride = kv_cache.stride(1);
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min(kv_lora_rank, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
-                             CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
-}
-
 namespace vllm {
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index f3f00edb36068aee040927c49c4cf3087cead3aa..6def0e061fa96f16a31dfeeb9079d0b56f24c79f 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) {
   delete ptr;
 }
 
+DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
+  this->realloc(allocation_unit * 128);
+}
+
+void DNNLScratchPadManager::realloc(size_t new_size) {
+  new_size = round(new_size);
+  if (new_size > size_) {
+    ptr_ = std::aligned_alloc(64, new_size);
+    size_ = new_size;
+  }
+}
+
+DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
+  static DNNLScratchPadManager manager;
+  return &manager;
+}
+
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
  public:
@@ -166,6 +183,23 @@ struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
            hash<int>()(static_cast<int>(val.bias_type));
   }
 };
+
+template <>
+struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
+  size_t operator()(
+      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
+    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
+  }
+};
+
+template <>
+struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
+  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
+    return hash<dnnl_dim_t>()(val.a_m_size) ^
+           hash<dnnl_dim_t>()(val.a_m_stride) ^ hash<bool>()(val.use_bias) ^
+           hash<int>()(static_cast<int>(val.bias_type));
+  }
+};
 }  // namespace std
 
 bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
@@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
          l.bias_type == r.bias_type;
 }
 
+bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
+                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
+}
+
+bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
+                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
+  return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride &&
+         l.use_bias == r.use_bias && l.bias_type == r.bias_type;
+}
+
 static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
 get_w8a8_class_primitive_cache(
     const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
@@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
   }
 
   dnnl::matmul matmul = get_matmul_cache(args);
+
+  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
+  scratchpad_storage->set_data_handle(
+      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
 }
@@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
 
   return m_size_cache_->get_or_create(key, [&]() {
     dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    manager->realloc(desc.scratchpad_desc().get_size());
     return dnnl::matmul(desc);
   });
 }
@@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+
+  memory_cache_[DNNL_ARG_SCRATCHPAD] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
 
 dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
@@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
                           dnnl::memory::format_tag::ab);
 
   dnnl::primitive_attr attr;
+
+  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
   // For PER_TOKEN, scales will be applied in outside epilogue
   if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
     attr.set_scales_mask(DNNL_ARG_SRC, 0);
@@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
                                         attr);
   }
 }
+
+MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
+    : DNNLMatMulPrimitiveHandler(
+          static_cast<DNNLMatMulPrimitiveHandler::Args>(args), args.ab_type),
+      m_size_cache_(nullptr) {
+  assert(ab_type_ == dnnl::memory::data_type::f32 ||
+         ab_type_ == dnnl::memory::data_type::bf16 ||
+         ab_type_ == dnnl::memory::data_type::f16);
+  prepack_weight(args.b_ptr,
+                 create_primitive_desc(
+                     MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+                                   .a_m_stride = DNNL_RUNTIME_DIM_VAL,
+                                   .use_bias = false,
+                                   .bias_type = dnnl::memory::data_type::undef},
+                     true)
+                     .weights_desc());
+  init_runtime_memory_cache(args);
+}
+
+static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
+get_matul_class_primitive_cache(
+    const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
+    int64_t cache_size) {
+  static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
+  assert(cache_size > 0);
+  return cache.get_or_create(key, [&]() {
+    return std::make_shared<MatMulPrimitiveHandler::MSizeCache>(cache_size);
+  });
+}
+
+void MatMulPrimitiveHandler::execute(ExecArgs& args) {
+  auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
+  auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
+  a_storage->set_data_handle((void*)args.a_ptr);
+  a_mem_desc->dims[0] = args.a_m_size;
+  a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
+  c_storage->set_data_handle((void*)args.c_ptr);
+  c_mem_desc->dims[0] = args.a_m_size;
+
+  if (args.use_bias) {
+    auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
+    bias_storage->set_data_handle((void*)args.bias_ptr);
+  }
+
+  dnnl::matmul matmul = get_matmul_cache(args);
+
+  auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
+  scratchpad_storage->set_data_handle(
+      DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
+
+  matmul.execute(default_stream(), memory_cache_);
+  default_stream().wait();
+}
+
+dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
+    const MSizeCacheKey& key) {
+  if (m_size_cache_.get() == nullptr) {
+    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
+    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
+  }
+  return m_size_cache_->get_or_create(key, [&]() {
+    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+    auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager();
+    manager->realloc(desc.scratchpad_desc().get_size());
+    return dnnl::matmul(desc);
+  });
+}
+
+dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
+    const MSizeCacheKey& key, bool first_time) {
+  dnnl::memory::desc a_md;
+  dnnl::memory::desc b_md;
+  if (first_time) {
+    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+                              dnnl::memory::format_tag::ab);
+    b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
+                              dnnl::memory::format_tag::any);
+  } else {
+    a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
+                              {key.a_m_stride, 1});
+    b_md = b_target_mem_desc_;
+  }
+  dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
+                          dnnl::memory::format_tag::ab);
+
+  dnnl::primitive_attr attr;
+  attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
+
+  if (key.use_bias) {
+    dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
+    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
+                                        c_md, attr);
+  } else {
+    return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+                                        attr);
+  }
+}
+
+void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
+  memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
+      {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
+  set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
+  memory_cache_[DNNL_ARG_DST] =
+      dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
+
+  memory_cache_[DNNL_ARG_BIAS] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
+
+  memory_cache_[DNNL_ARG_SCRATCHPAD] =
+      dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+                   default_engine(), nullptr);
+  set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
+}
diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h
index 54ceefced9e985e4b3cd20a1df17e1cbc215d57e..ad6773d2b9fd6d89f9530780810d8912f5076570 100644
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() {
   return DNNLType<std::decay_t<T>>::type;
 }
 
+class DNNLScratchPadManager {
+ public:
+  static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4KB
+
+  static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
+
+  DNNLScratchPadManager();
+
+  template <typename T>
+  T* get_data() {
+    return reinterpret_cast<T*>(ptr_);
+  }
+
+  static size_t round(size_t size) {
+    return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
+  }
+
+  void realloc(size_t new_size);
+
+ private:
+  size_t size_;
+  void* ptr_;
+};
+
 class DNNLMatMulPrimitiveHandler {
  public:
   virtual ~DNNLMatMulPrimitiveHandler() = default;
@@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
   std::shared_ptr<MSizeCache> m_size_cache_;
 };
 
+class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
+ public:
+  struct Args : public DNNLMatMulPrimitiveHandler::Args {
+    dnnl::memory::data_type ab_type;
+  };
+
+  struct ClassMatmulCacheKey {
+    dnnl_dim_t b_n_size;
+    dnnl_dim_t b_k_size;
+
+    friend bool operator==(const ClassMatmulCacheKey& l,
+                           const ClassMatmulCacheKey& r);
+  };
+
+  struct MSizeCacheKey {
+    dnnl_dim_t a_m_size;
+    dnnl_dim_t a_m_stride;
+    bool use_bias;
+    dnnl::memory::data_type bias_type;
+
+    friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
+  };
+
+  using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
+  using ClassMatmulCache =
+      DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
+
+  struct ExecArgs : public MSizeCacheKey {
+    const void* a_ptr;
+    const void* bias_ptr;
+    void* c_ptr;
+  };
+
+ public:
+  MatMulPrimitiveHandler(const Args& args);
+
+  void execute(ExecArgs& args);
+
+ private:
+  dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
+                                                     bool first_time);
+
+  void init_runtime_memory_cache(const Args& args);
+
+  dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
+
+ private:
+  std::shared_ptr<MSizeCache> m_size_cache_;
+};
+
 #endif
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index acc3b9ecde143382728a9a709081971de30ad3f5..9a3af4ac9d8a6f3d0f790ddc764b4deadf5c8a22 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
       }
     }
 
-    float scale_val, azp_val;
+    float scale_val;
+    float azp_val = 0.0f;
     if constexpr (AZP) {
       float max_scalar = max_value.reduce_max();
       float min_scalar = min_value.reduce_min();
@@ -379,6 +380,7 @@ void onednn_scaled_mm(
   exec_args.a_ptr = a.data_ptr<int8_t>();
   exec_args.a_m_size = a.size(0);
   exec_args.bias_ptr = nullptr;
+  exec_args.bias_type = get_dnnl_type<void>();
   exec_args.use_bias = false;
   exec_args.a_scales_ptr = nullptr;
   exec_args.a_zero_points_ptr = nullptr;
@@ -492,3 +494,56 @@ void dynamic_scaled_int8_quant(
         }
       });
 }
+
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+                                 int64_t primitive_cache_size) {
+  TORCH_CHECK(b.dim() == 2);
+
+  MatMulPrimitiveHandler::Args args;
+  args.primitive_cache_size = primitive_cache_size;
+
+  args.b_k_size = b.size(0);
+  args.b_k_stride = b.stride(0);
+  args.b_n_size = b.size(1);
+  args.b_n_stride = b.stride(1);
+  args.b_ptr = b.data_ptr();
+
+  VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler",
+                               [&] {
+                                 args.c_type = get_dnnl_type<scalar_t>();
+                                 args.ab_type = get_dnnl_type<scalar_t>();
+                               });
+
+  return reinterpret_cast<int64_t>(new MatMulPrimitiveHandler(args));
+}
+
+void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
+               const torch::Tensor& a,  // [M, IC], row-major
+               const std::optional<torch::Tensor>& bias, int64_t handler) {
+  CPU_KERNEL_GUARD_IN(onednn_mm)
+  TORCH_CHECK(a.dim() == 2);
+  TORCH_CHECK(a.stride(-1) == 1);
+  TORCH_CHECK(c.is_contiguous());
+  MatMulPrimitiveHandler* ptr =
+      reinterpret_cast<MatMulPrimitiveHandler*>(handler);
+
+  MatMulPrimitiveHandler::ExecArgs exec_args;
+  exec_args.a_m_size = a.size(0);
+  exec_args.a_m_stride = a.stride(0);
+
+  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
+    if (bias.has_value()) {
+      exec_args.use_bias = true;
+      exec_args.bias_type = get_dnnl_type<scalar_t>();
+      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
+    } else {
+      exec_args.use_bias = false;
+      exec_args.bias_type = get_dnnl_type<void>();
+      exec_args.bias_ptr = nullptr;
+    }
+    exec_args.a_ptr = a.data_ptr<scalar_t>();
+    exec_args.c_ptr = c.data_ptr<scalar_t>();
+
+    ptr->execute(exec_args);
+  });
+}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index c9f426bdf618ab457c922ad216ba6dee42ef93e5..98c3ebc5a75f85f833c69e9cd6385aeb96023aec 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                       const std::optional<torch::Tensor>& bias,
                       int64_t handler);
 
+int64_t create_onednn_mm_handler(const torch::Tensor& b,
+                                 int64_t primitive_cache_size);
+
+void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
+               const std::optional<torch::Tensor>& bias, int64_t handler);
+
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                         torch::Tensor& kv_cache, double scale,
                         torch::Tensor& block_tables, torch::Tensor& seq_lens);
@@ -153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("release_dnnl_matmul_handler(int handler) -> ()",
           &release_dnnl_matmul_handler);
 
+  // Create oneDNN GEMM handler
+  ops.def(
+      "create_onednn_mm_handler(Tensor b, int "
+      "primitive_cache_size) -> int",
+      &create_onednn_mm_handler);
+
+  // oneDNN GEMM
+  ops.def(
+      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
+      "int handler) -> ()");
+  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
+
   // Create oneDNN W8A8 handler
   ops.def(
       "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index 38002e06c9e14fd4ca6a66b4d30c65e23c8669de..2e8838684cc32e76bc544f55f52a744052e51f03 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -15,6 +15,8 @@ typedef __hip_bfloat16 nv_bfloat16;
 #include <map>
 #include <unordered_map>
 #include <vector>
+#include <cstdlib>
+#include <cstring>
 
 namespace vllm {
 #define CUDACHECK(cmd)                                              \
@@ -564,22 +566,47 @@ class CustomAllreduce {
     size /= d;
     auto bytes = size * sizeof(typename packed_t<T>::P);
     int blocks = std::min(block_limit, (size + threads - 1) / threads);
+
+    // Check environment variable once
+    const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO");
+    bool force_1stage = false;
+    bool force_2stage = false;
+    if (env_algo != nullptr) {
+      if (std::strcmp(env_algo, "1stage") == 0 ||
+          std::strcmp(env_algo, "oneshot") == 0) {
+        force_1stage = true;
+      } else if (std::strcmp(env_algo, "2stage") == 0 ||
+                 std::strcmp(env_algo, "twoshot") == 0) {
+        force_2stage = true;
+      } else {
+        throw std::runtime_error(
+            "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) +
+            ". Valid values: 1stage, oneshot, 2stage, twoshot");
+      }
+    }
+
 #define KL(ngpus, name)                                                       \
   name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                  rank_, size);
-#define REDUCE_CASE(ngpus)                            \
-  case ngpus: {                                       \
-    if (world_size_ == 2) {                           \
-      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (fully_connected_) {                    \
-      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
-          (world_size_ <= 8 && bytes < 256 * 1024)) { \
-        KL(ngpus, cross_device_reduce_1stage);        \
-      } else {                                        \
-        KL(ngpus, cross_device_reduce_2stage);        \
-      }                                               \
-    }                                                 \
-    break;                                            \
+#define REDUCE_CASE(ngpus)                              \
+  case ngpus: {                                         \
+    if (force_1stage) {                                 \
+      KL(ngpus, cross_device_reduce_1stage);            \
+    } else if (force_2stage) {                          \
+      KL(ngpus, cross_device_reduce_2stage);            \
+    } else {                                            \
+      if (world_size_ == 2) {                           \
+        KL(ngpus, cross_device_reduce_1stage);          \
+      } else if (fully_connected_) {                    \
+        if ((world_size_ <= 4 && bytes < 512 * 1024) || \
+            (world_size_ <= 8 && bytes < 256 * 1024)) { \
+          KL(ngpus, cross_device_reduce_1stage);        \
+        } else {                                        \
+          KL(ngpus, cross_device_reduce_2stage);        \
+        }                                               \
+      }                                                 \
+    }                                                   \
+    break;                                              \
   }
 
     switch (world_size_) {
diff --git a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
deleted file mode 100644
index ec75c29e54f4d0e87d75bba6086dcd660c5bcb20..0000000000000000000000000000000000000000
--- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
-// clang-format off
-#pragma once
-
-#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
-
-#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// GMMA_TMA_WS_SS (BlockScaled Builders)
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  int ScaleGranularityM
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
-    cute::enable_if_t<
-      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
-> {
-  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
-
-  static_assert(is_static<TileShape_MNK>::value);
-  static_assert(is_static<ClusterShape_MNK>::value);
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Should meet TMA alignment requirement\n");
-
-  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
-                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
-                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
-
-  // For fp32 types, map to tf32 MMA value type
-  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
-  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
-
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
-
-  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
-                                                          KernelTmaWarpSpecializedCooperative,
-                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
-                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
-  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
-      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
-
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
-
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-
-  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
-  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
-
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
-      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
-
-  using SmemCopyAtomA = void;
-  using SmemCopyAtomB = void;
-
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
deleted file mode 100644
index 13b90e998625e2337eaee3c4cac5fb70bafac01b..0000000000000000000000000000000000000000
--- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-// clang-format off
-// adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cute/algorithm/clear.hpp"
-#include "cute/tensor.hpp"
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////FP8 Accumulation///////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-/// This class provides API to promote (add) or scale (multiply_add) the results
-/// from the tensor core accumulators to the main accumulators when the number 
-/// of MMAs reaches the max number of MMA interval specified by user, after that
-/// the tensor core accumulators are zeroed.
-//////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-
-template <
-    class EngineAccum,
-    class LayoutAccum>
-struct GmmaFP8AccumulationWithScale {  
-  using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
-  using ElementAccumulator = typename EngineAccum::value_type;
-
-  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
-  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
-
-private:
-  TensorAccum& accum_;
-  TensorAccum accum_temp_;
-
-  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
-  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
-  uint32_t mma_count_;                        // current executed MMAs
-  uint32_t reset_accum_flag_;                 // accum needs to be zeroed or not. 
-
-  // promote or `add` the partial accumulators to main accumulator (FADD).
-  CUTLASS_DEVICE
-  void promote_core() {
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i);
-    }
-  }
-
-  // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA).
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    using TensorScale = cute::Tensor<EngineScale, LayoutScale>;
-
-    static_assert(is_static<LayoutScale>::value, "Scale Layout should be static");
-    static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident.");
-
-    static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape.");
-
-    warpgroup_wait<0>();
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < size(accum_); ++i) {
-      accum_(i) += accum_temp_(i) * scale(i);
-    }
-  }
-
-public:
-  CUTLASS_DEVICE
-  GmmaFP8AccumulationWithScale(
-      TensorAccum &accum,
-      uint32_t accum_promotion_interval,
-      uint32_t mma_count_per_mainloop_iteration)
-      : accum_(accum), 
-        accum_promotion_interval_(accum_promotion_interval),
-        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
-        mma_count_(0), 
-        reset_accum_flag_(0) 
-  {
-    accum_temp_ = cute::make_fragment_like(accum);
-  }
-
-  //
-  // Methods (Common)
-  //
-
-  CUTLASS_DEVICE 
-  TensorAccum& operator()() {
-    return accum_temp_;
-  }
-
-  /// prepare the MMA accumulators when initialization or zeroing is required.
-  CUTLASS_DEVICE
-  bool prepare_if_needed() { 
-    return reset_accum_flag_;
-  }
-
-  //
-  // Methods (for FADD version)
-  //
-
-  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_if_needed() {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      promote_core();
-      mma_count_ = 0;
-    }
-  }
-
-  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
-  CUTLASS_DEVICE
-  void promote_residue_if_needed() {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      promote_core();
-    }
-  }
-
-  //
-  // Methods (for FFMA version)
-  //
-
-  /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    mma_count_ += mma_count_per_mainloop_iteration_;
-    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
-    if (reset_accum_flag_) {
-      scale_core(scale);
-      mma_count_ = 0;
-    }
-  }
-
-  /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed.
-  template <
-    class EngineScale,
-    class LayoutScale>
-  CUTLASS_DEVICE
-  void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
-    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
-      scale_core(scale);
-    }
-  }
-};
-
-} // namespace cutlass::gemm::collective
diff --git a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
deleted file mode 100644
index ce7f47cf723377f8f777d7641850d6c93ac84a49..0000000000000000000000000000000000000000
--- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ /dev/null
@@ -1,729 +0,0 @@
-// clang-format off
-// Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
-
-/***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#pragma once
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/trace.h"
-#include "cutlass/numeric_types.h"
-
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm80.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp"
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace cutlass::gemm::collective {
-using namespace cute;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  int ScaleGranularityM_,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using ElementBlockScale = ElementAccumulator;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-  using PipelineParams = typename MainloopPipeline::Params;
-
-  // Two threads per CTA are producers (1 for operand tile and 32 for scales)
-  static constexpr int NumProducerThreadEvents = 33; 
-
-  static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
-  static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
-
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-
-  static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
-
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  
-  // Block scaling gmem-to-smem copy atom 
-  using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  
-  // Block scaling smem layout
-  using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
-  using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
-
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
-             "ElementAccumulator and ElementBlockScale should be same datatype");
-
-  struct SharedStorage
-  {
-    struct TensorStorage : cute::aligned_struct<128> {
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
-      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
-    } tensors;
-
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-
-  // Host side kernel arguments
-  struct Arguments {
-    ElementA const* ptr_A;
-    StrideA dA;
-    ElementB const* ptr_B;
-    StrideB dB;
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  // Device side kernel params
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,0),
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytes;
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
-    // Block scaling factors for A and B
-    ElementBlockScale const* ptr_scale_A; 
-    ElementBlockScale const* ptr_scale_B;
-  };
-
-  //
-  // Methods
-  //
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    (void) workspace;
-
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    auto ptr_A = reinterpret_cast<ElementA const*>(args.ptr_A);
-    auto ptr_B = reinterpret_cast<ElementB const*>(args.ptr_B);
-
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-
-    return {
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk,
-      args.ptr_scale_A,
-      args.ptr_scale_B
-    };
-  }
-
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    constexpr int tma_alignment_bits = 128;
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    
-    bool implementable = true;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});
-
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-    }
-    return implementable;
-  }
-
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytesMK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params)
-  {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
-  }
-
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract
-  /// Returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-
-    constexpr auto scales_m = Int<ScaleMsPerTile>{};
-    auto tM = get<2>(gA_mkl.shape());
-    auto tN = get<2>(gB_nkl.shape());
-    auto tK = get<3>(gA_mkl.shape());
-
-    // Make the tiled views of scale tensors
-    auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
-    auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
-    auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
-    auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
-
-    // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and 
-    // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
-    Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
-    Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
-
-    return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Producer Perspective
-  template <
-    class TensorA, class TensorB,
-    class TensorScaleA, class TensorScaleB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write,
-      cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Blockscaling: Tma loads for load_input and CpAsync for load_scale
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-    Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Prepare the TMA loads for A and B
-    //
-
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-
-    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-
-    // Partition the inputs based on the current block coordinates.
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-
-
-    // Block scaling: load_scale has scaling tensors in global memory which are not tiled
-    Tensor mScaleA_mkl = get<2>(load_inputs);
-    Tensor mScaleB_nkl = get<3>(load_inputs);
-    auto scales_m = get<0>(mScaleA_mkl.shape());
-
-    Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
-
-    Tensor gScaleA = local_tile( 
-      mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
-    Tensor cScaleA = local_tile( 
-      cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
-      make_coord(m_coord,_,l_coord));
-    Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
-
-    // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
-    TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, 
-      Layout<Shape<_32>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, 
-      Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
-    ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
-    ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
-    
-    Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
-    Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
-    Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
-    
-    Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
-    Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
-
-    // Applies the mapping from block_tma_a
-    Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
-    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
-
-    Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
-    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
-
-    uint16_t mcast_mask_a = 0;
-    uint16_t mcast_mask_b = 0;
-
-    // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
-    // Maps the tile -> block, value
-    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int n = 0; n < size<1>(block_layout); ++n) {
-        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-      }
-    }
-
-    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-      for (int m = 0; m < size<0>(block_layout); ++m) {
-        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-      }
-    }
-
-    // Allocate predicate tensors for a_scales (since we can't guarantee that 
-    // all scales are valid, since we could have a partial tiles along M)
-    Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
-    #pragma unroll
-    for (int i = 0; i < size(tApA_ScaleA); ++i) {
-      tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
-    }
-
-    // Mainloop
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count) {
-      // LOCK smem_pipe_write for _writing_
-      pipeline.producer_acquire(smem_pipe_write);
-
-      //
-      // Copy gmem to smem for *k_tile_iter
-      //
-      int write_stage = smem_pipe_write.index();
-      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-
-      // Copy operands A and B from global memory to shared memory
-      if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-      if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-
-      // Copy scale tensors from global memory to shared memory
-      copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
-      copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
-      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
-
-      ++k_tile_iter;
-
-      // Advance smem_pipe_write
-      ++smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(
-      MainloopPipeline pipeline,
-      PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-
-
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-    
-    // Block scaling
-    Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
-      Layout<
-        Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
-        Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
-      >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
-    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
-
-    //
-    // Define C accumulators and A/B partitioning
-    //
-    
-    // Layout of warp group to thread mapping
-
-    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
-                  stride<0>(typename TiledMma::BLayout{}) == 0 and
-                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
-                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
-                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-
-    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
-                                                  Int<NumThreadsPerWarpGroup>{});
-
-    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
-
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
-
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-    
-    // Per block scale values for operand A and B
-
-    using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
-    using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
-
-    Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
-    ElementBlockScale scale_b;
-
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-
-    GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA));
-    warpgroup_fence_operand(accumulation());
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      int read_stage = smem_pipe_read.index();
-      
-      // Load per block scale values from shared memory to registers.
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      ++smem_pipe_read;
-    }
-
-    warpgroup_fence_operand(accumulation());
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-
-      //
-      // Compute on k_tile
-      //
-
-      int read_stage = smem_pipe_read.index();
-
-      // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) 
-      scale_b = sScaleB[read_stage];
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
-      }
-      if constexpr (ScaleMsPerTile == 1) {
-        static_assert(size(RegLayoutScaleAEssential{}) == 1);
-        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
-      } else {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
-          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
-        }
-      }
-
-      if (accumulation.prepare_if_needed()) {
-        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-      }
-
-      warpgroup_fence_operand(accumulation());
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accumulation());
-
-      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
-      accumulation.scale_if_needed(tCrScaleAViewAsC);
-
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-    
-    accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
-
-    warpgroup_fence_operand(accumulation());
-  }
-
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-
-    smem_pipe_release.advance(k_tile_count);
-
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass::gemm::collective
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
deleted file mode 100644
index df809e27a3efee33f14cba886e72ff9face437fb..0000000000000000000000000000000000000000
--- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "cutlass/gemm/dispatch_policy.hpp"
-
-namespace cutlass::gemm {
-
-//////////////////////////////////////////////////////////////////////////////
-
-// FP8 related policies (including Blocked Scaled Accumulation)
-//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
-//  `ScaleGranularityM` indicates that scaling granularity is
-//  `size<0>(TileShape_MNK{})` along M.
-template <int ScaleGranularityM = 0>
-struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
-    : KernelTmaWarpSpecializedCooperative {};
-
-// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
-// specialized dynamic schedule For FP8 kernels with Block Scaling
-template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
-          class KernelSchedule = KernelTmaWarpSpecialized,
-          int ScaleGranularityM =
-              0  // `ScaleGranularityM` specifies scaling granularity along M,
-                 // while zero-value `ScaleGranularityM` indicates that scaling
-                 // granularity is `size<0>(TileShape_MNK{})` along M.
-          >
-struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
-    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
-                                         KernelSchedule> {
-  static_assert(
-      cute::is_same_v<
-          KernelSchedule,
-          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-              ScaleGranularityM>>,
-      "KernelSchedule must be one of the warp specialized policies");
-};
-
-//////////////////////////////////////////////////////////////////////////////
-
-}  // namespace cutlass::gemm
\ No newline at end of file
diff --git a/csrc/cutlass_extensions/vllm_collective_builder.cuh b/csrc/cutlass_extensions/vllm_collective_builder.cuh
index e7fbba4cd4b0d211a8a7a4f425cfc7435e249543..085ee1290031fb88eb1c1e06dfba50eca50dceda 100644
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
 
 namespace cutlass::gemm::collective {
 using namespace cute;
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index 2728aa81f0c9f51906acb8c7673e55ccc55e9e43..995374a50b037df4bc5a31057fd3ad1857f3201d 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -52,15 +52,6 @@
 #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
 
-#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \
-  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__)
-
-#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \
-  AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__)
-
-#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__))
-
 #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
 
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index f051eb07022280114f6b285239572aa4c72a0176..05be023de0f28f8afdda0d0c7d42161988b9e293 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -140,6 +140,211 @@ fused_add_rms_norm_kernel(
   }
 }
 
+/* Function specialization in the case of FP16/BF16 tensors.
+   Additional optimizations we can make in this case are
+   packed and vectorized operations, which help with the
+   memory latency bottleneck.
+
+   _f16VecPN struct extends _f16Vec to add operations specifically required for
+   polynomial normalization (poly norm).
+   The original _f16Vec does not include the sum-of-powers computation or
+   in-place polynomial normalization logic. */
+template <typename scalar_t, int width>
+struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
+  using Base = _f16Vec<scalar_t, width>;
+  using Converter = typename Base::Converter;
+  using T1 = typename Base::T1;
+  using T2 = typename Base::T2;
+  using Base::data;
+
+  __device__ auto sum_pows() const {
+    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});
+      float x2 = z.x * z.x;
+      float x4 = x2 * x2;
+      float x6 = x4 * x2;
+
+      float y2 = z.y * z.y;
+      float y4 = y2 * y2;
+      float y6 = y4 * y2;
+
+      s2 += x2 + y2;
+      s4 += x4 + y4;
+      s6 += x6 + y6;
+    }
+    return std::make_tuple(s2, s4, s6);
+  }
+
+  __device__ void poly_norm_inplace(const float w2_inv_std,
+                                    const float w1_inv_std2,
+                                    const float w0_inv_std3, const float bias) {
+#pragma unroll
+    for (int i = 0; i < width; i += 2) {
+      float2 z = Converter::convert(T2{data[i], data[i + 1]});
+
+      float x2 = z.x * z.x;
+      float x3 = x2 * z.x;
+      z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
+
+      float y2 = z.y * z.y;
+      float y3 = y2 * z.y;
+      z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
+
+      auto out = Converter::convert(z);
+      data[i] = out.x;
+      data[i + 1] = out.y;
+    }
+  }
+};
+
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  // Sanity checks on our vector struct and type-punned pointer arithmetic
+  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
+  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
+
+  /* These and the argument pointers are all declared `restrict` as they are
+     not aliased in practice. Argument pointers should not be dereferenced
+     in this kernel as that would be undefined behavior */
+  auto* __restrict__ input_v =
+      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
+  const int vec_hidden_size = hidden_size / width;
+  float variance = 0.0f;
+  float variance2 = 0.0f;
+  float variance3 = 0.0f;
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    auto [x2, x4, x6] = temp.sum_pows();
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];
+    float w1 = (float)weight[1];
+    float w2 = (float)weight[2];
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16VecPN<scalar_t, width> temp = input_v[id];
+    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
+    out_v[id] = temp;
+  }
+}
+
+/* Generic poly_norm_kernel
+   The width field is not used here but necessary for other specializations.
+ */
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
+poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
+                 const scalar_t* __restrict__ weight,  // [3]
+                 const scalar_t* __restrict__ bias,    // [1]
+                 const float epsilon, const int hidden_size) {
+  float variance = 0.0f;
+  float variance2 = 0.0f;
+  float variance3 = 0.0f;
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x4 = x2 * x2;
+    float x6 = x4 * x2;
+
+    variance += x2;
+    variance2 += x4;
+    variance3 += x6;
+  }
+
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
+
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;
+
+  __shared__ float s_w2_inv_std;
+  __shared__ float s_w1_inv_std2;
+  __shared__ float s_w0_inv_std3;
+  __shared__ float s_bias;
+
+  if (threadIdx.x == 0) {
+    float w0 = (float)weight[0];
+    float w1 = (float)weight[1];
+    float w2 = (float)weight[2];
+    s_bias = (float)bias[0];
+
+    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
+    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
+    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    float x2 = x * x;
+    float x3 = x2 * x;
+
+    out[blockIdx.x * hidden_size + idx] =
+        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
+                   s_bias);
+  }
+}
+
 }  // namespace vllm
 
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@@ -219,3 +424,49 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
     LAUNCH_FUSED_ADD_RMS_NORM(0);
   }
 }
+
+#define LAUNCH_FUSED_POLY_NORM(width)                                         \
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
+    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
+        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
+        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
+        hidden_size);                                                         \
+  });
+
+void poly_norm(torch::Tensor& out,     // [..., hidden_size]
+               torch::Tensor& input,   // [..., hidden_size]
+               torch::Tensor& weight,  // [3]
+               torch::Tensor& bias,    // [1]
+               double epsilon) {
+  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.data_ptr() != input.data_ptr());
+
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+
+  dim3 grid(num_tokens);
+  /* This kernel is memory-latency bound in many scenarios.
+     When num_tokens is large, a smaller block size allows
+     for increased block occupancy on CUs and better latency
+     hiding on global mem ops. */
+  const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+  dim3 block(std::min(hidden_size, max_block_size));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  /*If the tensor types are FP16/BF16, try to use the optimized kernel
+    with packed + vectorized ops.
+    Max optimization is achieved with a width-8 vector of FP16/BF16s
+    since we can load at most 128 bits at once in a global memory op.
+    However, this requires each tensor's data to be aligned to 16
+    bytes.
+   */
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
+  auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
+  bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
+  if (ptrs_are_aligned && hidden_size % 8 == 0) {
+    LAUNCH_FUSED_POLY_NORM(8);
+  } else {
+    LAUNCH_FUSED_POLY_NORM(0);
+  }
+}
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index c4ddbc142791fc28cb1c8d6be2f13501be48c3a0..d534e138d26d6123f0138902bdcd8508297f16ce 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -27,11 +27,12 @@
 
 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
          bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
 struct Selective_Scan_fwd_kernel_traits {
     static_assert(kNItems_ % 4 == 0);
     using input_t = input_t_;
     using weight_t = weight_t_;
+    using state_t = state_t_;
     static constexpr int kNThreads = kNThreads_;
     // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
     static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + 
+    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
     cache_index * params.ssm_states_batch_stride + 
     dim_id * kNRows * params.ssm_states_dim_stride;
     
@@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 if (threadIdx.x == 0) {
                     smem_running_prefix[state_idx] = prefix_op.running_prefix;
                     if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                     }
                 }
                 #pragma unroll
@@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     }
 }
 
-template<int kNThreads, int kNItems, typename input_t, typename weight_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
     // processing 1 row.
@@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
         BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
             BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t, state_t>;
                 constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                 dim3 grid(params.batch, params.dim / kNRows);
                 auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
     });
 }
 
-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
         if (params.seqlen <= 128) {           
-            selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
         } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #else
         if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
         } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
         }
     #endif
 }
 
-template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);
 
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
 
-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...)       \
     if (ITYPE == at::ScalarType::Half) {                                            \
         using input_t = at::Half;                                                   \
         using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::Half) {                                        \
+            using state_t = at::Half;                                               \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
     } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
         using input_t = at::BFloat16;                                               \
         using weight_t = float;                                                     \
-        __VA_ARGS__();                                                              \
+        if (STYPE == at::ScalarType::BFloat16) {                                    \
+            using state_t = at::BFloat16;                                           \
+            __VA_ARGS__();                                                          \
+        } else if (STYPE == at::ScalarType::Float) {                                \
+            using state_t = float;                                                  \
+            __VA_ARGS__();                                                          \
+        } else {                                                                    \
+            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
+        }                                                                           \
     } else if (ITYPE == at::ScalarType::Float)  {                                   \
         using input_t = float;                                                      \
         using weight_t = float;                                                     \
+        using state_t = float;                                                      \
         __VA_ARGS__();                                                              \
     } else {                                                                        \
         AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
     }
 
 
-template<typename input_t, typename weight_t>
+template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);
 
 void set_ssm_params_fwd(SSMParamsBase &params,
@@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
 
     // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
     at::Tensor out = delta;
-    TORCH_CHECK(ssm_states.scalar_type() == input_type);
+    // ssm_states can now be either the same as input_type or float32
+    auto state_type = ssm_states.scalar_type();
+    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
     TORCH_CHECK(ssm_states.is_cuda());
     TORCH_CHECK(ssm_states.stride(-1) == 1);
 
@@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
     
     const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
     auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
+        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
     });
 }
diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index 78f7b3cc1aa2502aa1d7a846b37b41e0023b275e..accbb09858fac493754b870ebb30feaf0c45e405 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -28,6 +28,7 @@ namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {
 
+constexpr float kNegInfinity = INFINITY * -1;
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
 constexpr int32_t BLOCK_SIZE = 512;
@@ -512,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
       warp_id * topk;
   s_topk_idx += warp_id * topk;
 
-  T value = cuda::std::numeric_limits<T>::min();
-  T topk_group_value = cuda::std::numeric_limits<T>::min();
+  T value = kNegInfinity;
+  T topk_group_value = kNegInfinity;
   int32_t num_equalto_topkth_group;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@@ -539,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
       __syncwarp();  // Ensure all threads have valid data before reduction
       topk_group_value = cg::reduce(tile, value, cg::greater<T>());
       if (value == topk_group_value) {
-        value = cuda::std::numeric_limits<T>::min();
+        value = kNegInfinity;
       }
       pre_count_equal_to_top_value = count_equal_to_top_value;
       count_equal_to_top_value = __popc(__ballot_sync(
-          FULL_WARP_MASK, (value == cuda::std::numeric_limits<T>::min())));
+          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
     }
     num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
   }
@@ -555,7 +556,7 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   int count_equalto_topkth_group = 0;
   bool if_proceed_next_topk =
-      (topk_group_value != cuda::std::numeric_limits<T>::min());
+      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
   if (case_id < num_tokens && if_proceed_next_topk) {
     for (int i_group = 0; i_group < n_group; i_group++) {
       if ((group_scores[i_group] > topk_group_value) ||
@@ -568,7 +569,7 @@ __global__ void group_idx_and_topk_idx_kernel(
               (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
                                                  scores_with_bias[offset + i]))
                   ? scores_with_bias[offset + i]
-                  : cuda::std::numeric_limits<T>::min();
+                  : cuda_cast<T, float>(kNegInfinity);
           queue.add(candidates, offset + i);
         }
         if (group_scores[i_group] == topk_group_value) {
diff --git a/csrc/ops.h b/csrc/ops.h
index bdd87b3d7b4a278cdfd581467720d5af8b1e78dd..7992ca7d10458cf6180a2c7453f82549b123d373 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+               torch::Tensor& bias, double epsilon);
+
 void apply_repetition_penalties_(torch::Tensor& logits,
                                  const torch::Tensor& prompt_mask,
                                  const torch::Tensor& output_mask,
@@ -130,8 +133,7 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 // void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
 //                         torch::Tensor& scale);
 
-#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+#ifndef USE_ROCM
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                               torch::Tensor& output_block_scale,
                               torch::Tensor& input,
@@ -356,4 +358,4 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
 void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                    int64_t quant_level, bool cast_bf2half = false);
 int64_t qr_max_size();
-#endif
\ No newline at end of file
+#endif
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
index fdac47c425d618d7f31350bd21f3c019e0bafcf8..57bcbaae45dda37e2201e07bfc2c207a6aaee643 100644
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -11,6 +11,7 @@
 #include "core/registration.h"
 
 #include "cutlass/cutlass.h"
+#include <limits>
 
 #include "cute/tensor.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
@@ -169,6 +170,11 @@ struct W4A8GemmKernel {
     int k = A.size(1);
     int n = B.size(1);
 
+    // safely cast group_size to int
+    TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits<int>::max(),
+                "group_size out of supported range for int: ", group_size);
+    int const group_size_int = static_cast<int>(group_size);
+
     // Allocate output
     const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
     auto device = A.device();
@@ -181,7 +187,7 @@ struct W4A8GemmKernel {
     auto A_ptr = static_cast<MmaType const*>(A.const_data_ptr());
     auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
     auto D_ptr = static_cast<ElementD*>(D.data_ptr());
-    // can we avoid harcode the 8 here
+    // can we avoid hardcode the 8 here
     auto S_ptr =
         static_cast<cutlass::Array<ElementScale, ScalePackSize> const*>(
             group_scales.const_data_ptr());
@@ -192,7 +198,7 @@ struct W4A8GemmKernel {
         cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
 
     // strides
-    int const scale_k = cutlass::ceil_div(k, group_size);
+    int const scale_k = cutlass::ceil_div(k, group_size_int);
     StrideA stride_A =
         cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
     // Reverse stride here due to swap and transpose
@@ -211,8 +217,8 @@ struct W4A8GemmKernel {
     using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
 
     MainloopArguments mainloop_arguments{
-        B_ptr, layout_B_reordered, A_ptr,     stride_A,
-        S_ptr, stride_S,           group_size};
+        B_ptr, layout_B_reordered, A_ptr,         stride_A,
+        S_ptr, stride_S,           group_size_int};
 
     EpilogueArguments epilogue_arguments{
         ChTokScalesEpilogue::prepare_args(channel_scales, token_scales),
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
index c841125dbb734f31d97f64d2d94c73f841a7bae1..939879b2c59fa2302ea096cd5fa6ac3419591912 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"
 
 namespace vllm {
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
index d50a83ae1cd485659dd43f34455f6268666b3800..78d5cf37fa6d0222513cf584705ee1e3c243aa82 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@@ -14,9 +14,6 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"
 
 namespace vllm {
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
index e089c3d4be2cc2385ea4d2309ca1cdc08caaeb7c..86220264151e70f549a43cd4e6762b90d0aa5ea2 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
@@ -13,27 +13,18 @@
 #include "cutlass/epilogue/dispatch_policy.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 
-#include "cutlass_extensions/gemm/dispatch_policy.hpp"
-#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
-
 #include "cutlass_gemm_caller.cuh"
 
 namespace vllm {
 
 using namespace cute;
 
-template <typename SchedulerType, typename OutType, int GroupSizeM_,
-          int GroupSizeN_, int GroupSizeK_, int TileSizeM_ = 128,
-          class ClusterShape = Shape<_1, _2, _1>>
+// clang-format off
+template <class OutType, int ScaleGranularityM,
+          int ScaleGranularityN, int ScaleGranularityK,
+          class MmaTileShape, class ClusterShape,
+          class EpilogueScheduler, class MainloopScheduler>
 struct cutlass_3x_gemm_fp8_blockwise {
-  using GroupSizeM = Int<GroupSizeM_>;
-  using GroupSizeN = Int<GroupSizeN_>;
-  using GroupSizeK = Int<GroupSizeK_>;
-  using TileSizeM = Int<TileSizeM_>;
-
-  static_assert(TileSizeM_ % GroupSizeM_ == 0,
-                "TileSizeM must be a multiple of GroupSizeM");
-
   using ElementAB = cutlass::float_e4m3_t;
 
   using ElementA = ElementAB;
@@ -45,52 +36,67 @@ struct cutlass_3x_gemm_fp8_blockwise {
   static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
 
   using ElementD = OutType;
-  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
+  using LayoutD = cutlass::layout::RowMajor;
   static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
 
-  using ElementC = void;
-  using StrideC = StrideD;
+  using ElementC = void; // TODO: support bias
+  using LayoutC = LayoutD;
   static constexpr int AlignmentC = AlignmentD;
 
   using ElementAccumulator = float;
-  using ElementBlockScale = float;
   using ElementCompute = float;
+  using ElementBlockScale = float;
+
+  using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<
+        ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
+
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
   using ArchTag = cutlass::arch::Sm90;
   using OperatorClass = cutlass::arch::OpClassTensorOp;
-  using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>;
-
-  using KernelSchedule = cutlass::gemm::
-      KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
-          GroupSizeM_>;
-  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
-  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
-
-  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<
-      cutlass::epilogue::fusion::Sm90AccFetch>;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
-          ElementAccumulator, ElementCompute, ElementC, StrideC, AlignmentC,
-          ElementD, StrideD, AlignmentD, EpilogueSchedule,
-          StoreEpilogueCompute>::CollectiveOp;
-
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          ArchTag, OperatorClass, ElementA, LayoutA, AlignmentA, ElementB,
-          LayoutB, AlignmentB, ElementAccumulator, TileShape, ClusterShape,
-          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
-              sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          KernelSchedule>::CollectiveOp;
+
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using ElementScalar = float;
+  using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduler,
+      DefaultOperation
+  >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduler
+  >::CollectiveOp;
 
   using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      SchedulerType>>;
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
 
   struct GemmKernel : public KernelType {};
-
-  using StrideA = typename GemmKernel::StrideA;
-  using StrideB = typename GemmKernel::StrideB;
 };
 
 template <typename Gemm>
@@ -99,76 +105,54 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
                                    torch::Tensor const& a_scales,
                                    torch::Tensor const& b_scales) {
   using GemmKernel = typename Gemm::GemmKernel;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutSFA = typename Gemm::LayoutSFA;
+  using LayoutSFB = typename Gemm::LayoutSFB;
+  using ScaleConfig = typename Gemm::ScaleConfig;
 
   using ElementAB = typename Gemm::ElementAB;
   using ElementD = typename Gemm::ElementD;
 
-  auto prob_shape = c3x::get_problem_shape(a, b);
-  int32_t m = get<0>(prob_shape), n = get<1>(prob_shape),
-          k = get<2>(prob_shape);
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
 
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
+  TORCH_CHECK(m % 4 == 0, "m must be divisible by 4");
 
-  using StrideA = Stride<int64_t, Int<1>, int64_t>;
-  using StrideB = Stride<int64_t, Int<1>, int64_t>;
-  using StrideC = typename Gemm::StrideC;
+  StrideA a_stride;
+  StrideB b_stride;
+  StrideC c_stride;
+  a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  c_stride =
+      cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
 
-  StrideA a_stride{lda, Int<1>{}, 0};
-  StrideB b_stride{ldb, Int<1>{}, 0};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
+  LayoutSFA layout_SFA = 
+      ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = 
+      ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
 
   auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
   auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
   auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
   auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
 
-  // Check is the t is contiguous and is 1D or 2D with one of the dimensions
-  // being 1 (i.e. a row or column vector)
-  auto is_contiguous_vector = [](const torch::Tensor& t) {
-    auto t_sizes = t.sizes();
-    return t.is_contiguous() &&
-           (t.dim() == 1 ||
-            (t.dim() == 2 &&
-             *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
-  };
-
-  // TODO(lucas): lets clean-up the kernel so that we pass in Strides so
-  //  we don't have to deal with enforcing implicit layouts
-  TORCH_CHECK(a_scales.size(0) == m / Gemm::GroupSizeM::value);
-  TORCH_CHECK(a_scales.size(1) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(a_scales.stride(0) == 1 || is_contiguous_vector(a_scales),
-              "a_scales must be M major");
-  TORCH_CHECK(b_scales.size(0) == k / Gemm::GroupSizeK::value);
-  TORCH_CHECK(b_scales.size(1) == n / Gemm::GroupSizeN::value);
-  TORCH_CHECK(b_scales.stride(0) == 1 || is_contiguous_vector(b_scales),
-              "b_scales must be K major");
-  typename GemmKernel::MainloopArguments mainloop_args{
-      a_ptr, a_stride, b_ptr, b_stride, a_scales_ptr, b_scales_ptr};
+  auto mainloop_args = [&](){
+    return typename GemmKernel::MainloopArguments{
+        a_ptr,        a_stride,   b_ptr,        b_stride,
+        a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
+    };
+  }();
+  auto prob_shape = cute::make_shape(m, n, k, 1);
 
   auto c_ptr = static_cast<ElementD*>(out.data_ptr());
   typename GemmKernel::EpilogueArguments epilogue_args{
       {}, c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::TileSchedulerArguments scheduler;
-
-  static constexpr bool UsesStreamKScheduler =
-      cute::is_same_v<typename GemmKernel::TileSchedulerTag,
-                      cutlass::gemm::StreamKScheduler>;
-
-  if constexpr (UsesStreamKScheduler) {
-    using DecompositionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
-    using ReductionMode = typename cutlass::gemm::kernel::detail::
-        PersistentTileSchedulerSm90StreamKParams::ReductionMode;
-
-    scheduler.decomposition_mode = DecompositionMode::StreamK;
-    scheduler.reduction_mode = ReductionMode::Nondeterministic;
-  }
-
   c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
-                                       epilogue_args, scheduler);
+                                       epilogue_args);
 }
 
 template <typename OutType>
@@ -177,18 +161,12 @@ void cutlass_gemm_blockwise_sm90_fp8_dispatch(torch::Tensor& out,
                                               torch::Tensor const& b,
                                               torch::Tensor const& a_scales,
                                               torch::Tensor const& b_scales) {
-  auto k = a.size(1);
-  auto n = b.size(1);
-
-  if (k > 3 * n) {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  } else {
-    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
-        cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
-        out, a, b, a_scales, b_scales);
-  }
+  // TODO: better heuristics
+  cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
+      OutType, 1, 128, 128, Shape<_128, _128, _128>,
+      Shape<_1, _2, _1>, cutlass::epilogue::TmaWarpSpecializedCooperative,
+      cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum>>(
+      out, a, b, a_scales, b_scales);
 }
 
 }  // namespace vllm
\ No newline at end of file
diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
index 2ee6a19407f923e13110fae658e0809f141afa03..3af59267bd60c6d4b105c061d947b610d510ba9e 100644
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_helper.hpp
@@ -32,7 +32,7 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
     TORCH_CHECK(a_scales.dim() == 2, "a scale must be 2d tensor.");
     TORCH_CHECK(b_scales.dim() == 2, "b scale must be 2d tensor.");
     int32_t version_num = get_sm_version_num();
-    if (version_num >= 100) {
+    if (version_num >= 90) {
       TORCH_CHECK(
           a.size(0) == a_scales.size(0) &&
               cuda_utils::ceil_div(a.size(1), int64_t(128)) == a_scales.size(1),
@@ -41,32 +41,6 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
           cuda_utils::ceil_div(b.size(0), int64_t(128)) == b_scales.size(0) &&
               cuda_utils::ceil_div(b.size(1), int64_t(128)) == b_scales.size(1),
           "b_scale_group_shape must be [128, 128].");
-    } else {
-      // TODO: Remove this after using cutlass sm90 blockwise scaling gemm
-      // kernel, or introducing ceil_div to the load_init() of mainloop.
-      using GroupShape = std::array<int64_t, 2>;
-      auto make_group_shape = [](torch::Tensor const& x,
-                                 torch::Tensor const& s) -> GroupShape {
-        TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
-        return {cuda_utils::ceil_div(x.size(0), s.size(0)),
-                cuda_utils::ceil_div(x.size(1), s.size(1))};
-      };
-
-      GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
-      GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
-
-      // 1x128 per-token group scales for activations
-      // 128x128 blockwise scales for weights
-      TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
-                   b_scale_group_shape == GroupShape{128, 128} &&
-                   a.dtype() == torch::kFloat8_e4m3fn &&
-                   b.dtype() == torch::kFloat8_e4m3fn),
-                  "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
-                  "a_scale_group_shape must be [1, 128]. Got: [",
-                  a_scale_group_shape[0], ", ", a_scale_group_shape[1],
-                  "]\n"
-                  "b_scale_group_shape must be [128, 128]. Got: [",
-                  b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
     }
 
     TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
index 9bbeb0334fb9a81c21c3a461a5dde86ede759444..b4eb141cb4883d495318f4d381826801e5387193 100644
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
@@ -26,164 +26,17 @@
 #include "dispatch_utils.h"
 
 #include "cuda_utils.h"
+#include "nvfp4_utils.cuh"
 
 namespace vllm {
 
-// Get type2 from type or vice versa (applied to half and bfloat16)
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = c10::Half;
-};
-
-template <>
-struct TypeConverter<c10::Half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = c10::BFloat16;
-};
-
-template <>
-struct TypeConverter<c10::BFloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
 template <class Type>
 __inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
                                                    PackedVec<Type>& vec2) {
   PackedVec<Type> result;
 #pragma unroll
   for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
-    if constexpr (std::is_same_v<Type, c10::Half>) {
+    if constexpr (std::is_same_v<Type, half>) {
       half2 val(0.5f, 0.5f);
       half2 t0 = __hmul2(vec.elts[i], val);
       half2 t1 = __hfma2(h2tanh(t0), val, val);
@@ -206,13 +59,12 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
                                                   PackedVec<Type>& vec2,
                                                   float SFScaleVal,
                                                   uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   PackedVec<Type> out_silu = compute_silu(vec, vec2);
   // Get absolute maximum values among the local 8 values.
   auto localMax = __habs2(out_silu.elts[0]);
 
-  // Local maximum value.
-  #pragma unroll
+// Local maximum value.
+#pragma unroll
   for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
     localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
   }
@@ -259,9 +111,9 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
   // Convert the input to float.
   float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
 
-  #pragma unroll
+#pragma unroll
   for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, c10::Half>) {
+    if constexpr (std::is_same_v<Type, half>) {
       fp2Vals[i] = __half22float2(out_silu.elts[i]);
     } else {
       fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
@@ -275,22 +127,14 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
 
   // Write the e2m1 values to global memory.
   return e2m1Vec;
-#else
-  return 0;
-#endif
 }
 
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4(
-#else
-silu_and_cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(1024, 4)
+    silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                             float const* SFScale, uint32_t* out,
+                             uint32_t* SFout) {
   using PackedVec = PackedVec<Type>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -328,22 +172,25 @@ silu_and_cvt_fp16_to_fp4(
           in_vec, in_vec2, SFScaleVal, sf_out);
     }
   }
-#endif
 }
 
 }  // namespace vllm
 
-void silu_and_mul_nvfp4_quant(torch::Tensor& output,  // [..., d]
-                              torch::Tensor& output_sf,
-                              torch::Tensor& input,  // [..., 2 * d]
-                              torch::Tensor& input_sf) {
-  TORCH_CHECK(input.dtype() == torch::kFloat16 ||
-              input.dtype() == torch::kBFloat16);
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,  // [..., d]
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,  // [..., 2 * d]
+                                     torch::Tensor& input_sf) {
   int32_t m = input.size(0);
   int32_t n = input.size(1) / 2;
+
   TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");
+
   int multiProcessorCount =
       get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+
   auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
   auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
   auto output_ptr = static_cast<int64_t*>(output.data_ptr());
@@ -352,17 +199,14 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output,  // [..., d]
   dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
   int const numBlocksPerSM = 2048 / block.x;
   dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
   VLLM_DISPATCH_HALF_TYPES(
-      input.scalar_type(), "act_and_mul_quant_kernel", [&] {
-        auto input_ptr = reinterpret_cast<scalar_t const*>(input.data_ptr());
-        VLLM_DISPATCH_BYTE_TYPES(
-            output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type",
-            [&] {
-              vllm::silu_and_cvt_fp16_to_fp4<scalar_t>
-                  <<<grid, block, 0, stream>>>(
-                      m, n, input_ptr, input_sf_ptr,
-                      reinterpret_cast<uint32_t*>(output_ptr),
-                      reinterpret_cast<uint32_t*>(sf_out));
-            });
+      input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+        vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
+            m, n, input_ptr, input_sf_ptr,
+            reinterpret_cast<uint32_t*>(output_ptr),
+            reinterpret_cast<uint32_t*>(sf_out));
       });
 }
diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
index 03db5cc196d59d45ec4730e43350926f06e06f15..2c8df6144bf4d1b4a8324b1ae7d92b2009172dd8 100644
--- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
+++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <torch/all.h>
 #include <cutlass/arch/arch.h>
 
diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu
index 190d66f318a8387e0b69ff29c833653b5e73e4b4..ce3ba2c19b9ebe6f9886de07bfe58fe506340c5a 100644
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@@ -1,247 +1,42 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <torch/all.h>
 
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 
-#include <cuda_runtime.h>
 #include <cuda_fp8.h>
+#include "dispatch_utils.h"
 
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = half;
-};
-
-template <>
-struct TypeConverter<half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = __nv_bfloat16;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
+#include "nvfp4_utils.cuh"
 
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
-// Quantizes the provided PackedVec into the uint32_t output
-template <class Type, bool UE8M0_SF = false>
-__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
-                                         uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  // Get absolute maximum values among the local 8 values.
-  auto localMax = __habs2(vec.elts[0]);
-
-  // Local maximum value.
-  #pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
-  }
-
-  // Get the absolute maximum among all 16 values (two threads).
-  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-  // Get the final absolute maximum values.
-  float vecMax = float(__hmax(localMax.x, localMax.y));
-
-  // Get the SF (max value of the vector / max value of e2m1).
-  // maximum value of e2m1 = 6.0.
-  // TODO: use half as compute data type.
-  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
-  // 8 bits representation of the SF.
-  uint8_t fp8SFVal;
-  // Write the SF to global memory (STG.8).
-  if constexpr (UE8M0_SF) {
-    // Extract the 8 exponent bits from float32.
-    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
-    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
-    fp8SFVal = tmp & 0xff;
-    // Convert back to fp32.
-    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
-  } else {
-    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
-    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
-    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
-    // Convert back to fp32.
-    SFValue = float(tmp);
-  }
-  // Get the output scale.
-  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
-  //                       reciprocal(SFScaleVal))
-  float outputScale =
-      SFValue != 0 ? reciprocal_approximate_ftz(
-                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
-                   : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
-
-  // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
-
-  #pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
-    fp2Vals[i].x *= outputScale;
-    fp2Vals[i].y *= outputScale;
-  }
-
-  // Convert to e2m1 values.
-  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
-
-  // Write the e2m1 values to global memory.
-  return e2m1Vec;
-#else
-  return 0;
-#endif
-}
+namespace vllm {
 
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
-    uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(512, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts,
+                    bool low_latency) {
   using PackedVec = PackedVec<Type>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -299,8 +94,8 @@ cvt_fp16_to_fp4(
                 &input_offset_by_experts[chunk_start + 12]));
         local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);
 
-  // Check against the 16 loaded offsets
-  #pragma unroll
+// Check against the 16 loaded offsets
+#pragma unroll
         for (int i = 0; i < 16; i++) {
           if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
             rowIdx_in_expert = rowIdx - local_offsets[i];
@@ -330,21 +125,15 @@ cvt_fp16_to_fp4(
 
     out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
   }
-#endif
 }
 
 // Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
 template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(1024, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
-    uint32_t* output_scale_offset_by_experts, int n_experts) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(1024, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout,
+                    uint32_t* input_offset_by_experts,
+                    uint32_t* output_scale_offset_by_experts, int n_experts) {
   using PackedVec = PackedVec<Type>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -425,7 +214,6 @@ cvt_fp16_to_fp4(
 
     out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
   }
-#endif
 }
 
 template <typename T>
@@ -501,6 +289,8 @@ void quant_impl(void* output, void* output_scale, void* input,
   }
 }
 
+}  // namespace vllm
+
 /*Quantization entry for fp4 experts quantization*/
 #define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
 #define CHECK_CONTIGUOUS(x, m) \
@@ -560,23 +350,17 @@ void scaled_fp4_experts_quant_sm100a(
   // 4 means 4 fp8 values are packed into one int32
   TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
 
-  auto in_dtype = input.dtype();
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream =
       at::cuda::getCurrentCUDAStream(input.get_device());
-  if (in_dtype == at::ScalarType::Half) {
-    quant_impl<half>(output.data_ptr(), output_scale.data_ptr(),
-                     input.data_ptr(), input_global_scale.data_ptr(),
-                     input_offset_by_experts.data_ptr(),
-                     output_scale_offset_by_experts.data_ptr(), m_topk, k,
-                     n_experts, stream);
-  } else if (in_dtype == at::ScalarType::BFloat16) {
-    quant_impl<__nv_bfloat16>(output.data_ptr(), output_scale.data_ptr(),
-                              input.data_ptr(), input_global_scale.data_ptr(),
-                              input_offset_by_experts.data_ptr(),
-                              output_scale_offset_by_experts.data_ptr(), m_topk,
-                              k, n_experts, stream);
-  } else {
-    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
-  }
+
+  VLLM_DISPATCH_HALF_TYPES(
+      input.scalar_type(), "nvfp4_experts_quant_kernel", [&] {
+        using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+        vllm::quant_impl<cuda_type>(
+            output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
+            input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
+            output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
+            stream);
+      });
 }
diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu
index 1b61bd4519fc33b3072ecf2fa8e4e578953bad62..c2b39e5438805df06d2db89daff5168f9f7c48dd 100644
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -32,6 +32,14 @@ void scaled_fp4_experts_quant_sm100a(
     torch::Tensor const& output_scale_offset_by_experts);
 #endif
 
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,
+                                     torch::Tensor& output_sf,
+                                     torch::Tensor& input,
+                                     torch::Tensor& input_sf);
+#endif
+
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                       torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -54,3 +62,13 @@ void scaled_fp4_experts_quant(
   TORCH_CHECK_NOT_IMPLEMENTED(false,
                               "No compiled nvfp4 experts quantization kernel");
 }
+
+void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
+                              torch::Tensor& input, torch::Tensor& input_sf) {
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "No compiled silu_and_mul nvfp4 quantization kernel");
+}
diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
index 4e080de151648190769001ed0beed067bfa0acc2..0c1b9ef0664d711b3db6f7e3ff8bd9d181de1138 100644
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@@ -23,245 +23,18 @@
 #include <c10/cuda/CUDAGuard.h>
 
 #include <cuda_fp8.h>
+#include "dispatch_utils.h"
 
 #include "cuda_utils.h"
+#include "nvfp4_utils.cuh"
 
-// Get type2 from type or vice versa (applied to half and bfloat16)
-template <typename T>
-struct TypeConverter {
-  using Type = half2;
-};  // keep for generality
-
-template <>
-struct TypeConverter<half2> {
-  using Type = half;
-};
-
-template <>
-struct TypeConverter<half> {
-  using Type = half2;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat162> {
-  using Type = __nv_bfloat16;
-};
-
-template <>
-struct TypeConverter<__nv_bfloat16> {
-  using Type = __nv_bfloat162;
-};
-
-#define ELTS_PER_THREAD 8
-
-constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-constexpr int CVT_FP4_SF_VEC_SIZE = 16;
-
-// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
-        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
-inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  uint32_t val;
-  asm volatile(
-      "{\n"
-      ".reg .b8 byte0;\n"
-      ".reg .b8 byte1;\n"
-      ".reg .b8 byte2;\n"
-      ".reg .b8 byte3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
-      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
-      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
-      "}"
-      : "=r"(val)
-      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
-        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
-  return val;
-#else
-  return 0;
-#endif
-}
-
-// Fast reciprocal.
-inline __device__ float reciprocal_approximate_ftz(float a) {
-  float b;
-  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
-  return b;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
-                                                       int numCols,
-                                                       SFType* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 2);
-
-  // One pair of threads write one SF to global memory.
-  // TODO: stage through smem for packed STG.32
-  // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    // SF vector index (16 elements share one SF in the K dimension).
-    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
-    int32_t mIdx = rowIdx;
-
-    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
-    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
-
-    int32_t mTileIdx = mIdx / (32 * 4);
-    // SF vector size 16.
-    int factor = CVT_FP4_SF_VEC_SIZE * 4;
-    int32_t numKTiles = (numCols + factor - 1) / factor;
-    int64_t mTileStride = numKTiles * 32 * 4 * 4;
-
-    int32_t kTileIdx = (kIdx / 4);
-    int64_t kTileStride = 32 * 4 * 4;
-
-    // M tile layout [32, 4] is column-major.
-    int32_t outerMIdx = (mIdx % 32);
-    int64_t outerMStride = 4 * 4;
-
-    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
-    int64_t innerMStride = 4;
-
-    int32_t innerKIdx = (kIdx % 4);
-    int64_t innerKStride = 1;
-
-    // Compute the global offset.
-    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
-                       outerMIdx * outerMStride + innerMIdx * innerMStride +
-                       innerKIdx * innerKStride;
-
-    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-  }
-#endif
-  return nullptr;
-}
-
-// Define a 16 bytes packed data type.
-template <class Type>
-struct PackedVec {
-  typename TypeConverter<Type>::Type elts[4];
-};
-
-template <>
-struct PackedVec<__nv_fp8_e4m3> {
-  __nv_fp8x2_e4m3 elts[8];
-};
-
-// Quantizes the provided PackedVec into the uint32_t output
-template <class Type, bool UE8M0_SF = false>
-__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
-                                         uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  // Get absolute maximum values among the local 8 values.
-  auto localMax = __habs2(vec.elts[0]);
-
-  // Local maximum value.
-  #pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
-  }
-
-  // Get the absolute maximum among all 16 values (two threads).
-  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-  // Get the final absolute maximum values.
-  float vecMax = float(__hmax(localMax.x, localMax.y));
-
-  // Get the SF (max value of the vector / max value of e2m1).
-  // maximum value of e2m1 = 6.0.
-  // TODO: use half as compute data type.
-  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
-  // 8 bits representation of the SF.
-  uint8_t fp8SFVal;
-  // Write the SF to global memory (STG.8).
-  if constexpr (UE8M0_SF) {
-    // Extract the 8 exponent bits from float32.
-    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
-    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
-    fp8SFVal = tmp & 0xff;
-    // Convert back to fp32.
-    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
-  } else {
-    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
-    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
-    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
-    // Convert back to fp32.
-    SFValue = float(tmp);
-  }
-  // Get the output scale.
-  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
-  //                       reciprocal(SFScaleVal))
-  float outputScale =
-      SFValue != 0 ? reciprocal_approximate_ftz(
-                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
-                   : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
-
-  // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
-
-  #pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
-    fp2Vals[i].x *= outputScale;
-    fp2Vals[i].y *= outputScale;
-  }
-
-  // Convert to e2m1 values.
-  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
-
-  // Write the e2m1 values to global memory.
-  return e2m1Vec;
-#else
-  return 0;
-#endif
-}
+namespace vllm {
 
 // Use UE4M3 by default.
 template <class Type, bool UE8M0_SF = false>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__global__ void __launch_bounds__(512, 4)
+    cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
+                    float const* SFScale, uint32_t* out, uint32_t* SFout) {
   using PackedVec = PackedVec<Type>;
   static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
       (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
@@ -293,7 +66,6 @@ cvt_fp16_to_fp4(
           cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
     }
   }
-#endif
 }
 
 template <typename T>
@@ -332,6 +104,8 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
                                     int multiProcessorCount,
                                     cudaStream_t stream);
 
+}  // namespace vllm
+
 void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
@@ -340,6 +114,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   int32_t n = input.size(1);
 
   TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
+                  input.scalar_type() == at::ScalarType::BFloat16,
+              "Unsupported input data type for quantize_to_fp4.");
 
   int multiProcessorCount =
       get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
@@ -353,24 +130,10 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
   // We don't support e8m0 scales at this moment.
   bool useUE8M0 = false;
 
-  switch (input.scalar_type()) {
-    case torch::kHalf: {
-      auto input_ptr = reinterpret_cast<half const*>(input.data_ptr());
-      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
-                            useUE8M0, multiProcessorCount, stream);
-      break;
-    }
-    case torch::kBFloat16: {
-      auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr());
-      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
-                            useUE8M0, multiProcessorCount, stream);
-      break;
-    }
-    default: {
-      std::cerr << "Observing: " << input.scalar_type()
-                << " for the input datatype which is invalid";
-      throw std::runtime_error(
-          "Unsupported input data type for quantize_to_fp4.");
-    }
-  }
+  VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
+    using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
+    auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
+    vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
+                                sf_out, useUE8M0, multiProcessorCount, stream);
+  });
 }
diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..48e4959de979378e09eba59924e8bc3ff2ad8e3b
--- /dev/null
+++ b/csrc/quantization/fp4/nvfp4_utils.cuh
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda_fp8.h>
+
+#define ELTS_PER_THREAD 8
+
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+namespace vllm {
+
+// Convert PyTorch cpp type to CUDA type
+template <typename T>
+struct CUDATypeConverter {
+  using Type = T;
+};
+
+template <>
+struct CUDATypeConverter<at::Half> {
+  using Type = half;
+};
+
+template <>
+struct CUDATypeConverter<at::BFloat16> {
+  using Type = __nv_bfloat16;
+};
+
+// Get type2 from type or vice versa (applied to half and bfloat16)
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};  // keep for generality
+
+template <>
+struct TypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct TypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+// Define a 16 bytes packed data type.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
+        "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
+  return val;
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
+        "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
+  return val;
+}
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
+                                                       int numCols,
+                                                       SFType* SFout) {
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
+                CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
+    // SF vector index (16 elements share one SF in the K dimension).
+    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+    int32_t mIdx = rowIdx;
+
+    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
+
+    int32_t mTileIdx = mIdx / (32 * 4);
+    // SF vector size 16.
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numKTiles = (numCols + factor - 1) / factor;
+    int64_t mTileStride = numKTiles * 32 * 4 * 4;
+
+    int32_t kTileIdx = (kIdx / 4);
+    int64_t kTileStride = 32 * 4 * 4;
+
+    // M tile layout [32, 4] is column-major.
+    int32_t outerMIdx = (mIdx % 32);
+    int64_t outerMStride = 4 * 4;
+
+    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
+    int64_t innerMStride = 4;
+
+    int32_t innerKIdx = (kIdx % 4);
+    int64_t innerKStride = 1;
+
+    // Compute the global offset.
+    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
+                       outerMIdx * outerMStride + innerMIdx * innerMStride +
+                       innerKIdx * innerKStride;
+
+    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  }
+  return nullptr;
+}
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
+                                         uint8_t* SFout) {
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+}
+
+}  // namespace vllm
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index 0d14ba15937c65a26d5936fa9fc42eb53eeec736..8fd536ef46e3d0970d96e30a1d45aa358811630b 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
             ))
 
     def prepacked_type_key(prepack_type: PrepackTypeConfig):
-        # For now we we can just use the first accumulator type seen since
+        # For now, we can just use the first accumulator type seen since
         # the tensor core shapes/layouts don't vary based on accumulator
         # type so we can generate less code this way
         return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 68e2a4c1a49c19729f3cfdbf85a407dce832692e..789d989f97b3da18de86c263e97902751e33a069 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -115,8 +115,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 //       "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
 //   ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
 
-#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
-    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+#ifndef USE_ROCM
   ops.def(
       "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
       "Tensor input, Tensor input_global_scale) -> ()");
@@ -169,6 +168,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "float epsilon) -> ()");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
 
+  // Polynomial Normalization.
+  ops.def(
+      "poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
+      "epsilon) -> ()");
+  ops.impl("poly_norm", torch::kCUDA, &poly_norm);
+
   // Apply repetition penalties to logits in-place
   ops.def(
       "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@@ -521,10 +526,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // SM100 CUTLASS MLA decode
   ops.def(
-      "sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
-      "                         Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
-      "                         Tensor page_table, Tensor workspace, float "
-      "scale,"
+      "sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"
+      "                         Tensor q_pe, Tensor kv_c_and_k_pe_cache,"
+      "                         Tensor seq_lens, Tensor page_table,"
+      "                         Tensor workspace, float scale,"
       "                         int num_kv_splits) -> ()");
   // conditionally compiled so impl in source file
 
@@ -698,16 +703,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
       "                     Tensor scale) -> ()");
   cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
 
-  cache_ops.def(
-      "cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
-      "                              Tensor cp_local_token_select_indices,"
-      "                              Tensor! kv_cache,"
-      "                              Tensor slot_mapping,"
-      "                              str kv_cache_dtype,"
-      "                              Tensor scale) -> ()");
-  cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
-                 &cp_fused_concat_and_cache_mla);
-
   // Convert the key and value cache to fp8 data type.
   cache_ops.def(
       "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2e272cbca84177eed611852dd5d5e5f9d02f2b2b..307e9658f7175fa7a6ebbaad4d26cad6dc0525f2 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -237,7 +237,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=400
+ARG VLLM_MAX_SIZE_MB=450
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@@ -261,6 +261,8 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
+# Install libnuma-dev, required by fastsafetensors (fixes #20384)
+RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
@@ -373,7 +375,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.14.post1"
+ARG FLASHINFER_GIT_REF="v0.3.0"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
@@ -432,11 +434,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
+ARG DEEPGEMM_GIT_REF
 COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
 RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \
-    && rm /tmp/install_deepgemm.sh
+    VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} 
 
 # Install EP kernels(pplx-kernels and DeepEP), NixL
 COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
@@ -518,7 +519,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     else \
         BITSANDBYTES_VERSION="0.46.1"; \
     fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
+    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron
deleted file mode 100644
index 8bc23554718dcdf28a7b32778c182573620c961e..0000000000000000000000000000000000000000
--- a/docker/Dockerfile.neuron
+++ /dev/null
@@ -1,56 +0,0 @@
-# default base image
-# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
-
-FROM $BASE_IMAGE
-
-RUN echo "Base image is $BASE_IMAGE"
-
-# Install some basic utilities
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3 \
-        python3-pip \
-        ffmpeg libsm6 libxext6 libgl1
-
-### Mount Point ###
-# When launching the container, mount the code directory to /workspace
-ARG APP_MOUNT=/workspace
-VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}/vllm
-
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install pytest
-
-# uninstall transformers-neuronx package explicitly to avoid version conflict
-RUN python3 -m pip uninstall -y transformers-neuronx
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN python3 -m pip install -U \
-        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        -r requirements/neuron.txt
-
-ENV VLLM_TARGET_DEVICE neuron
-RUN --mount=type=bind,source=.git,target=.git \
-    pip install --no-build-isolation -v -e .
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-# install transformers-neuronx package as an optional dependencies (for V0)
-# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
-RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
-
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-
-# overwrite entrypoint to run bash script
-RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
-
-CMD ["/bin/bash"]
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index f1648573250434c230b9b2a7219ef116c47e18ea..063fc4969328872ea472e37d3b7ee084d0f23416 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -47,6 +47,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
 
 # -----------------------
@@ -71,7 +72,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 RUN cd /vllm-workspace \
     && rm -rf vllm \
     && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
+    && python3 -m pip install lm-eval[api]==0.4.4 \
     && python3 -m pip install pytest-shard
 
 # -----------------------
@@ -100,8 +101,10 @@ ARG COMMON_WORKDIR
 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
+COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker
 
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 
 # ENV that can improve safe tensor loading, and end-to-end time
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 3414c0aa845cbebcb948de565330517d121f5779..2ba5461dfe551fc7aa527660936584cc9055cc0d 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -1,18 +1,16 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
-ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
+ARG HIPBLASLT_BRANCH="aa0bda7b"
+ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
 ARG LEGACY_HIPBLASLT_OPTION=
-ARG RCCL_BRANCH="648a58d"
-ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
+ARG PYTORCH_BRANCH="f717b2af"
 ARG PYTORCH_VISION_BRANCH="v0.21.0"
-ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="916bf3c"
+ARG AITER_BRANCH="4822e675"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
@@ -45,7 +43,7 @@ RUN apt-get update -y \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
 
-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
 
 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH
 # Set to "--legacy_hipblas_direct" for ROCm<=6.2
 ARG LEGACY_HIPBLASLT_OPTION
 RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
 RUN cd hipBLAS-common \
     && git checkout ${HIPBLAS_COMMON_BRANCH} \
     && mkdir build \
@@ -69,24 +68,17 @@ RUN cd hipBLASLt \
     && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
 
-FROM base AS build_rccl
-ARG RCCL_BRANCH
-ARG RCCL_REPO
-RUN git clone ${RCCL_REPO}
-RUN cd rccl \
-    && git checkout ${RCCL_BRANCH} \
-    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
-RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
-
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 RUN git clone ${TRITON_REPO}
 RUN cd triton \
     && git checkout ${TRITON_BRANCH} \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=dist
-RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+    && if [ ! -f setup.py ]; then cd python; fi \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && mkdir -p /app/install && cp dist/*.whl /app/install
+RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
+    && python3 -m build --wheel && cp dist/*.whl /app/install; fi
 
 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
@@ -132,15 +124,25 @@ RUN cd aiter \
 RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 
+FROM base AS debs
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    cp /install/*.deb /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
     dpkg -i /install/*deb \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
-RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
-    dpkg -i /install/*deb \
-    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
-    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+    && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
+    && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@@ -154,8 +156,6 @@ ARG BASE_IMAGE
 ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
-ARG RCCL_BRANCH
-ARG RCCL_REPO
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 ARG PYTORCH_BRANCH
@@ -170,8 +170,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
     && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
     && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
-    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
-    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
     && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
     && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@@ -180,4 +178,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file
diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 9270b48c54d4b173c357dc07a853464a44eacbf6..9942b7626f81e77a518e830bce0b4d8e81c6e594 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -16,7 +16,8 @@ ENV LANG=C.UTF-8 \
 RUN microdnf install -y \
     which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
     libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \
+    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
+    clang llvm-devel llvm-static clang-devel && \
     microdnf clean all
 
 # Python Installation
@@ -191,7 +192,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         -DCOMPILER_RT_BUILD_ORC=OFF                      \
         -DCOMPILER_RT_INCLUDE_TESTS=OFF                  \
         ${CMAKE_ARGS} -GNinja ../llvm                    \
-
     && ninja install  . && \
     #  build llvmlite
     cd ../../llvmlite && python setup.py bdist_wheel && \
@@ -200,6 +200,45 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
     fi && python setup.py bdist_wheel
 
+# Edit aws-lc-sys to support s390x
+FROM python-install AS aws-lc-sys-editor
+WORKDIR /tmp
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+ARG AWS_LC_VERSION=v0.30.0
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone --recursive https://github.com/aws/aws-lc-rs.git && \
+    cd aws-lc-rs && \
+    git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    cd aws-lc-sys && \
+    sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
+    sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
+    sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
+    
+# Build Outlines Core
+FROM python-install AS outlines-core-builder
+WORKDIR /tmp
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+ARG OUTLINES_CORE_VERSION=0.2.10
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    --mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \
+    git clone https://github.com/dottxt-ai/outlines-core.git && \
+    cd outlines-core && \
+    git checkout tags/${OUTLINES_CORE_VERSION} && \
+    sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
+    echo '[patch.crates-io]' >> Cargo.toml && \
+    echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
+    uv pip install maturin && \
+    python -m maturin build --release --out dist
 
 # Final build stage
 FROM python-install AS vllm-cpu
@@ -230,6 +269,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
     --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
+    --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
      sed -i '/^torch/d' requirements/build.txt && \
      ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
      VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
@@ -237,6 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
      TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
      LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
      NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
+     OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
     uv pip install -v \    
         $ARROW_WHL_FILE  \
         $VISION_WHL_FILE \
@@ -244,6 +285,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         $TORCH_WHL_FILE \
         $LLVM_WHL_FILE \
         $NUMBA_WHL_FILE \
+        $OUTLINES_CORE_WHL_FILE \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \
         -r requirements/cpu.txt
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 65d2e5036b78338b8c6642a47ac85d7821d27d41..ef422352509a97e8b93cbe6aa296c97fa961bba1 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -1,12 +1,10 @@
 FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
 
-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
+    add-apt-repository -y ppa:kobuk-team/intel-graphics
 
 RUN apt clean && apt-get update -y && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get install -y python3.10 python3.10-distutils && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
     apt-get install -y --no-install-recommends --fix-missing \
     curl \
     ffmpeg \
@@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \
     libgl1 \
     lsb-release \
     numactl \
-    python3.10-dev \
-    wget
+    wget \
+    vim \
+    python3.12 \
+    python3.12-dev \
+    python3-pip
 
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
 
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
+RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
+
+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
+SHELL ["bash", "-c"]
+CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
 
 WORKDIR /workspace/vllm
 COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
 COPY requirements/common.txt /workspace/vllm/requirements/common.txt
 
+# suppress the python externally managed environment error
+RUN python3 -m pip config set global.break-system-packages true
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
     -r requirements/xpu.txt
@@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
 
-ENV VLLM_USAGE_SOURCE production-docker-image \
-    TRITON_XPU_PROFILE 1
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip uninstall oneccl oneccl-devel -y
+
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/docs/.nav.yml b/docs/.nav.yml
index dbac0e12f1bf2ef1fc1fd428dad9f515351763b9..c103ed476d76d68417c62765348088d63ae7cfbe 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -32,10 +32,7 @@ nav:
       - models/pooling_models.md
       - models/extensions
       - Hardware Supported Models: models/hardware_supported_models
-    - Features:
-      - features/compatibility_matrix.md
-      - features/*
-      - features/quantization
+    - Features: features
   - Developer Guide:
     - contributing/README.md
     - General:
@@ -47,11 +44,12 @@ nav:
       - contributing/model/registration.md
       - contributing/model/tests.md
       - contributing/model/multimodal.md
+      - contributing/model/transcription.md
     - CI: contributing/ci
     - Design Documents: design
   - API Reference:
     - api/README.md
-    - api/vllm/*
+    - api/vllm
   - CLI Reference: cli
   - Community:
     - community/*
diff --git a/docs/community/meetups.md b/docs/community/meetups.md
index 221a7bd96213f76fc7066c0410d55c0981840579..a3004249b758b512fff363307f38776b7940ada5 100644
--- a/docs/community/meetups.md
+++ b/docs/community/meetups.md
@@ -2,6 +2,8 @@
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
+- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
+- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
 - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
 - [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 2d8cdcc11fa9970081eb5d08cd4c3a94e84781c0..c853fcf92941e5baf077c8f6d493accef7639e83 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -174,6 +174,8 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
 
 Known supported models:
 
+- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
+- Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
 - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
 - Qwen2.5-VL (<gh-pr:22742>)
@@ -208,7 +210,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
 
 !!! note
     API server scale-out disables [multi-modal IPC caching](#ipc-caching)
-    because it requires a one-to-one correspondance between API and engine core processes.
+    because it requires a one-to-one correspondence between API and engine core processes.
 
     This does not impact [multi-modal processor caching](#processor-caching).
 
@@ -225,7 +227,7 @@ to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalPro
 ### IPC Caching
 
 Multi-modal IPC caching is automatically enabled when
-there is a one-to-one correspondance between API (`P0`) and engine core (`P1`) processes,
+there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
 to avoid repeatedly transferring the same multi-modal inputs between them.
 
 ### Configuration
diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index 2bbed778f3c6aa757f0a93c84dda8c5e4fef63ab..25c2d2955ff2f9dc29be27e973d8df705281ab54 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks:
 
 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
 
+### Manually Trigger the benchmark
+
+Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.  
+For CPU environment, please use the image with "-cpu" postfix.
+
+Here is an example for docker run command for CPU.  
+
+```bash
+docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface  -e HF_TOKEN=''  --shm-size=16g --name vllm-cpu-ci  public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
+```
+
+Then, run below command inside the docker instance.  
+
+```bash
+bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+```
+
+When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.  
+
+#### Runtime environment variables
+
+- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
+- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
+- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
+- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
+- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
+
+For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
+
 The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
 
-More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
 
 [](){ #nightly-benchmarks }
 
diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md
index 0ca77fa499db736e1775ae2141e348f4d20d59d4..6c013738ac1ec5721678ef69490e157001597c2f 100644
--- a/docs/contributing/model/README.md
+++ b/docs/contributing/model/README.md
@@ -15,6 +15,7 @@ Read through these pages for a step-by-step guide:
 - [Registering a Model](registration.md)
 - [Unit Testing](tests.md)
 - [Multi-Modal Support](multimodal.md)
+- [Speech-to-Text Support](transcription.md)
 
 !!! tip
     If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
new file mode 100644
index 0000000000000000000000000000000000000000..62e58e5c6ac58dc97b7746b7339e39673d09f763
--- /dev/null
+++ b/docs/contributing/model/transcription.md
@@ -0,0 +1,276 @@
+# Speech-to-Text (Transcription/Translation) Support
+
+This document walks you through the steps to add support for speech-to-text (ASR) models to vLLM’s transcription and translation APIs by implementing [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription].
+Please refer to the [supported models](../../models/supported_models.md#transcription) for further guidance.
+
+## Update the base vLLM model
+
+It is assumed you have already implemented your model in vLLM according to the basic model guide. Extend your model with the [SupportsTranscription][vllm.model_executor.models.interfaces.SupportsTranscription] interface and implement the following class attributes and methods.
+
+### `supported_languages` and `supports_transcription_only`
+
+Declare supported languages and capabilities:
+
+- The `supported_languages` mapping is validated at init time.
+- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
+
+??? code "supported_languages and supports_transcription_only"
+    ```python
+    from typing import ClassVar, Mapping, Optional, Literal
+    import numpy as np
+    import torch
+    from torch import nn
+
+    from vllm.config import ModelConfig, SpeechToTextConfig
+    from vllm.inputs.data import PromptType
+    from vllm.model_executor.models.interfaces import SupportsTranscription
+    
+    class YourASRModel(nn.Module, SupportsTranscription):
+        # Map of ISO 639-1 language codes to language names
+        supported_languages: ClassVar[Mapping[str, str]] = {
+            "en": "English",
+            "it": "Italian",
+            # ... add more as needed
+        }
+        
+        # If your model only supports audio-conditioned generation
+        # (no text-only generation), enable this flag.
+        supports_transcription_only: ClassVar[bool] = True
+    ```
+
+Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor.models.interfaces.SupportsTranscription.get_speech_to_text_config].
+
+This is for controlling general behavior of the API when serving your model:
+
+??? code "get_speech_to_text_config()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_speech_to_text_config(
+            cls,
+            model_config: ModelConfig,
+            task_type: Literal["transcribe", "translate"],
+        ) -> SpeechToTextConfig:
+            return SpeechToTextConfig(
+                sample_rate=16_000,
+                max_audio_clip_s=30,
+                # Set to None to disable server-side chunking if your
+                # model/processor handles it already
+                min_energy_split_window_size=None,
+            )
+    ```
+
+See [Audio preprocessing and chunking](#audio-preprocessing-and-chunking) for what each field controls.
+
+Implement the prompt construction via [get_generation_prompt][vllm.model_executor.models.interfaces.SupportsTranscription.get_generation_prompt]. The server passes you the resampled waveform and task parameters; you return a valid [PromptType][vllm.inputs.data.PromptType]. There are two common patterns:
+
+#### Multimodal LLM with audio embeddings (e.g., Voxtral, Gemma3n)
+
+Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            # Example with a free-form instruction prompt
+            task_word = "Transcribe" if task_type == "transcribe" else "Translate"
+            prompt = (
+                "<start_of_turn>user\n"
+                f"{task_word} this audio: <audio_soft_token>"
+                "<end_of_turn>\n<start_of_turn>model\n"
+            )
+
+            return {
+                "multi_modal_data": {"audio": (audio, stt_config.sample_rate)},
+                "prompt": prompt,
+            }
+    ```
+
+    For further clarification on multi modal inputs, please refer to [Multi-Modal Inputs](../../features/multimodal_inputs.md).
+
+#### Encoder–decoder audio-only (e.g., Whisper)
+
+Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
+
+??? code "get_generation_prompt()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_generation_prompt(
+            cls,
+            audio: np.ndarray,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+            language: Optional[str],
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str],
+        ) -> PromptType:
+            if language is None:
+                raise ValueError("Language must be specified")
+
+            prompt = {
+                "encoder_prompt": {
+                    "prompt": "",
+                    "multi_modal_data": {
+                        "audio": (audio, stt_config.sample_rate),
+                    },
+                },
+                "decoder_prompt": (
+                    (f"<|prev|>{request_prompt}" if request_prompt else "")
+                    + f"<|startoftranscript|><|{language}|>"
+                    + f"<|{task_type}|><|notimestamps|>"
+                ),
+            }
+            return cast(PromptType, prompt)
+    ```
+
+### `validate_language` (optional)
+
+Language validation via [validate_language][vllm.model_executor.models.interfaces.SupportsTranscription.validate_language]
+
+If your model requires a language and you want a default, override this method (see Whisper):
+
+??? code "validate_language()"
+    ```python
+    @classmethod
+    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+        if language is None:
+            logger.warning(
+                "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
+            language = "en"
+        return super().validate_language(language)
+    ```
+
+### `get_num_audio_tokens` (optional)
+
+Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.models.interfaces.SupportsTranscription.get_num_audio_tokens]
+
+Provide a fast duration→token estimate to improve streaming usage statistics:
+
+??? code "get_num_audio_tokens()"
+    ```python
+    class YourASRModel(nn.Module, SupportsTranscription):
+        ...
+
+        @classmethod
+        def get_num_audio_tokens(
+            cls,
+            audio_duration_s: float,
+            stt_config: SpeechToTextConfig,
+            model_config: ModelConfig,
+        ) -> Optional[int]:
+            # Return None if unknown; otherwise return an estimate.
+            return int(audio_duration_s * stt_config.sample_rate // 320)  # example
+    ```
+
+## Audio preprocessing and chunking
+
+The API server takes care of basic audio I/O and optional chunking before building prompts:
+
+- Resampling: Input audio is resampled to `SpeechToTextConfig.sample_rate` using `librosa`.
+- Chunking: If `SpeechToTextConfig.allow_audio_chunking` is True and the duration exceeds `max_audio_clip_s`, the server splits the audio into overlapping chunks and generates a prompt per chunk. Overlap is controlled by `overlap_chunk_second`.
+- Energy-aware splitting: When `min_energy_split_window_size` is set, the server finds low-energy regions to minimize cutting within words.
+
+Relevant server logic:
+
+??? code "_preprocess_speech_to_text()"
+    ```python
+    # vllm/entrypoints/openai/speech_to_text.py
+    async def _preprocess_speech_to_text(...):
+        language = self.model_cls.validate_language(request.language)
+        ...
+        y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
+        duration = librosa.get_duration(y=y, sr=sr)
+        do_split_audio = (self.asr_config.allow_audio_chunking
+                        and duration > self.asr_config.max_audio_clip_s)
+        chunks = [y] if not do_split_audio else self._split_audio(y, int(sr))
+        prompts = []
+        for chunk in chunks:
+            prompt = self.model_cls.get_generation_prompt(
+                audio=chunk,
+                stt_config=self.asr_config,
+                model_config=self.model_config,
+                language=language,
+                task_type=self.task_type,
+                request_prompt=request.prompt,
+                to_language=to_language,
+            )
+            prompts.append(prompt)
+        return prompts, duration
+    ```
+
+## Exposing tasks automatically
+
+vLLM automatically advertises transcription support if your model implements the interface:
+
+```python
+if supports_transcription(model):
+    if model.supports_transcription_only:
+        return ["transcription"]
+    supported_tasks.append("transcription")
+```
+
+When enabled, the server initializes the transcription and translation handlers:
+
+```python
+state.openai_serving_transcription = OpenAIServingTranscription(...) if "transcription" in supported_tasks else None
+state.openai_serving_translation = OpenAIServingTranslation(...) if "transcription" in supported_tasks else None
+```
+
+No extra registration is required beyond having your model class available via the model registry and implementing `SupportsTranscription`.
+
+## Examples in-tree
+
+- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
+- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
+- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
+
+## Test with the API
+
+Once your model implements `SupportsTranscription`, you can test the endpoints (API mimics OpenAI):
+
+- Transcription (ASR):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/transcriptions
+    ```
+
+- Translation (source → English unless otherwise supported):
+
+    ```bash
+    curl -s -X POST \
+      -H "Authorization: Bearer $VLLM_API_KEY" \
+      -H "Content-Type: multipart/form-data" \
+      -F "file=@/path/to/audio.wav" \
+      -F "model=$MODEL_ID" \
+      http://localhost:8000/v1/audio/translations
+    ```
+
+Or check out more examples in <gh-file:examples/online_serving>.
+
+!!! note
+    - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
+    - Implementing `get_num_audio_tokens` improves accuracy of streaming usage metrics (`prompt_tokens`) without an extra forward pass.
+    - For multilingual behavior, keep `supported_languages` aligned with actual model capabilities.
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index 74627e9062167acd2f695d2717e05fe1c182040f..5b83d93274f0def5bd6caba0d671a4075dd90d37 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -19,7 +19,7 @@ When using `vllm bench serve`, you can enable profiling by passing the `--profil
 Traces can be visualized using <https://ui.perfetto.dev/>.
 
 !!! tip
-You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
+    You can directly call bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`.
 
 !!! tip
     Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
@@ -73,6 +73,8 @@ apt install nsight-systems-cli
 
 ### Example commands and usage
 
+When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
+
 #### Offline Inference
 
 For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
diff --git a/docs/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md
index 0b41e73b030cc1f9eb288a789e6c158a7319784c..40a463a8a596c6c9718eea82f61028d8426b9b6e 100644
--- a/docs/deployment/frameworks/anything-llm.md
+++ b/docs/deployment/frameworks/anything-llm.md
@@ -1,41 +1,53 @@
-# Anything LLM
+# AnythingLLM
 
-[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
+[AnythingLLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
 
 It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
 
 ## Prerequisites
 
-- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```
 
 ## Deploy
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with a supported chat-completion model, for example:
 
-```bash
-vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
+    ```
+
+1. Download and install [AnythingLLM Desktop](https://anythingllm.com/desktop).
+
+1. Configure the AI provider:
+
+    - At the bottom, click the 🔧 wrench icon -> **Open settings** -> **AI Providers** -> **LLM**.
+    - Enter the following values:
+        - LLM Provider: Generic OpenAI
+        - Base URL: `http://{vllm server host}:{vllm server port}/v1`
+        - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
 
-- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
+    ![set AI providers](../../assets/deployment/anything-llm-provider.png)
 
-- On the bottom left of open settings, AI Providers --> LLM:
-    - LLM Provider: Generic OpenAI
-    - Base URL: http://{vllm server host}:{vllm server port}/v1
-    - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
+1. Create a workspace:
 
-![](../../assets/deployment/anything-llm-provider.png)
+    1. At the bottom, click the ↺ back icon and back to workspaces.
+    1. Create a workspace (e.g., `vllm`) and start chatting.
 
-- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
+    ![create a workspace](../../assets/deployment/anything-llm-chat-without-doc.png)
 
-![](../../assets/deployment/anything-llm-chat-without-doc.png)
+1. Add a document.
 
-- Click the upload button:
-    - upload the doc
-    - select the doc and move to the workspace
-    - save and embed
+    1. Click the 📎 attachment icon.
+    1. Upload a document.
+    1. Select and move the document into your workspace.
+    1. Save and embed it.
 
-![](../../assets/deployment/anything-llm-upload-doc.png)
+    ![add a document](../../assets/deployment/anything-llm-upload-doc.png)
 
-- Chat again:
+1. Chat using your document as context.
 
-![](../../assets/deployment/anything-llm-chat-with-doc.png)
+    ![chat with your context](../../assets/deployment/anything-llm-chat-with-doc.png)
diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md
index c255a85d384016f76727ea1d1dc7eaadf5651d41..7517ee771c09774cfe5bc914865fe4ed9e79359f 100644
--- a/docs/deployment/frameworks/autogen.md
+++ b/docs/deployment/frameworks/autogen.md
@@ -4,9 +4,7 @@
 
 ## Prerequisites
 
-- Setup vLLM environment
-
-- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
+Set up the vLLM and [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment:
 
 ```bash
 pip install vllm
@@ -18,14 +16,14 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
 
 ## Deploy
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-python -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-Instruct-v0.2
-```
+    ```bash
+    python -m vllm.entrypoints.openai.api_server \
+        --model mistralai/Mistral-7B-Instruct-v0.2
+    ```
 
-- Call it with AutoGen:
+1. Call it with AutoGen:
 
 ??? code
 
diff --git a/docs/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md
index cbca6e6282fc65042b520b2a3044be0c01acb8a5..002935da560094b8f7ebed457bbf09872933b56b 100644
--- a/docs/deployment/frameworks/chatbox.md
+++ b/docs/deployment/frameworks/chatbox.md
@@ -6,27 +6,31 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
 
 ## Prerequisites
 
-- Setup vLLM environment
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```
 
 ## Deploy
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```
 
-- Download and install [Chatbox desktop](https://chatboxai.app/en#download).
+1. Download and install [Chatbox desktop](https://chatboxai.app/en#download).
 
-- On the bottom left of settings, Add Custom Provider
+1. On the bottom left of settings, Add Custom Provider
     - API Mode: `OpenAI API Compatible`
     - Name: vllm
     - API Host: `http://{vllm server host}:{vllm server port}/v1`
     - API Path: `/chat/completions`
     - Model: `qwen/Qwen1.5-0.5B-Chat`
 
-![](../../assets/deployment/chatbox-settings.png)
+    ![](../../assets/deployment/chatbox-settings.png)
 
-- Go to `Just chat`, and start to chat:
+1. Go to `Just chat`, and start to chat:
 
-![](../../assets/deployment/chatbox-chat.png)
+    ![](../../assets/deployment/chatbox-chat.png)
diff --git a/docs/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md
index 35f02c33cb02b9d76062f7a8348e8f1cbcaeb0f4..820ef0cbed9fafc2fbe955fc5fdffc5922b22631 100644
--- a/docs/deployment/frameworks/dify.md
+++ b/docs/deployment/frameworks/dify.md
@@ -8,44 +8,50 @@ This guide walks you through deploying Dify using a vLLM backend.
 
 ## Prerequisites
 
-- Setup vLLM environment
-- Install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/)
+Set up the vLLM environment:
+
+```bash
+pip install vllm
+```
+
+And install [Docker](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/).
 
 ## Deploy
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-vllm serve Qwen/Qwen1.5-7B-Chat
-```
+    ```bash
+    vllm serve Qwen/Qwen1.5-7B-Chat
+    ```
 
-- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
+1. Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):
 
-```bash
-git clone https://github.com/langgenius/dify.git
-cd dify
-cd docker
-cp .env.example .env
-docker compose up -d
-```
+    ```bash
+    git clone https://github.com/langgenius/dify.git
+    cd dify
+    cd docker
+    cp .env.example .env
+    docker compose up -d
+    ```
+
+1. Open the browser to access `http://localhost/install`, config the basic login information and login.
 
-- Open the browser to access `http://localhost/install`, config the basic login information and login.
+1. In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
 
-- In the top-right user menu (under the profile icon), go to Settings, then click `Model Provider`, and locate the `vLLM` provider to install it.
+1. Fill in the model provider details as follows:
 
-- Fill in the model provider details as follows:
     - **Model Type**: `LLM`
     - **Model Name**: `Qwen/Qwen1.5-7B-Chat`
     - **API Endpoint URL**: `http://{vllm_server_host}:{vllm_server_port}/v1`
     - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
     - **Completion Mode**: `Completion`
 
-![](../../assets/deployment/dify-settings.png)
+    ![](../../assets/deployment/dify-settings.png)
 
-- To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
+1. To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
 
-![](../../assets/deployment/dify-create-chatbot.png)
+    ![](../../assets/deployment/dify-create-chatbot.png)
 
-- Click the chatbot you just created to open the chat interface and start interacting with the model:
+1. Click the chatbot you just created to open the chat interface and start interacting with the model:
 
-![](../../assets/deployment/dify-chat.png)
+    ![](../../assets/deployment/dify-chat.png)
diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md
index 70b4b48d4543e72f9c304fee07cd7bdb6192ab1d..836305cf15c42e5918e89c3486210231b8e01071 100644
--- a/docs/deployment/frameworks/haystack.md
+++ b/docs/deployment/frameworks/haystack.md
@@ -6,7 +6,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
 
 ## Prerequisites
 
-- Setup vLLM and Haystack environment
+Set up the vLLM and Haystack environment:
 
 ```bash
 pip install vllm haystack-ai
@@ -14,13 +14,13 @@ pip install vllm haystack-ai
 
 ## Deploy
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-vllm serve mistralai/Mistral-7B-Instruct-v0.1
-```
+    ```bash
+    vllm serve mistralai/Mistral-7B-Instruct-v0.1
+    ```
 
-- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
+1. Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
 
 ??? code
 
diff --git a/docs/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md
index c7e514f2276e04cf2b67036b414472ccd9cf8840..0d6c3729911adc56527da6b868086cabd4b2f2f6 100644
--- a/docs/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@@ -13,7 +13,7 @@ And LiteLLM supports all models on VLLM.
 
 ## Prerequisites
 
-- Setup vLLM and litellm environment
+Set up the vLLM and litellm environment:
 
 ```bash
 pip install vllm litellm
@@ -23,13 +23,13 @@ pip install vllm litellm
 
 ### Chat completion
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-vllm serve qwen/Qwen1.5-0.5B-Chat
-```
+    ```bash
+    vllm serve qwen/Qwen1.5-0.5B-Chat
+    ```
 
-- Call it with litellm:
+1. Call it with litellm:
 
 ??? code
 
@@ -51,13 +51,13 @@ vllm serve qwen/Qwen1.5-0.5B-Chat
 
 ### Embeddings
 
-- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.
 
-```bash
-vllm serve BAAI/bge-base-en-v1.5
-```
+    ```bash
+    vllm serve BAAI/bge-base-en-v1.5
+    ```
 
-- Call it with litellm:
+1. Call it with litellm:
 
 ```python
 from litellm import embedding   
diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md
index 3319dc6c90e1eab6b3201566458ab2fb38d196fe..3b9fa3ea43d642e67ea1cd8c8d8ccbd790a36473 100644
--- a/docs/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml`
     metadata:
       name: vllm
     spec:
-      replicas: 2
+      replicas: 1
       leaderWorkerTemplate:
         size: 2
         restartPolicy: RecreateGroupOnPodRestart
@@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml`
                   - sh
                   - -c
                   - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); 
-                    python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
+                    vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
                 resources:
                   limits:
                     nvidia.com/gpu: "8"
@@ -126,8 +126,6 @@ Should get an output similar to this:
 NAME       READY   STATUS    RESTARTS   AGE
 vllm-0     1/1     Running   0          2s
 vllm-0-1   1/1     Running   0          2s
-vllm-1     1/1     Running   0          2s
-vllm-1-1   1/1     Running   0          2s
 ```
 
 Verify that the distributed tensor-parallel inference works:
diff --git a/docs/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md
index d5f2ec302b6cd8d193e18e20581cd6703b4eb3c1..d86ab1600f1269646cf968cd5a3f12c8b7ce9b8a 100644
--- a/docs/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@@ -11,7 +11,7 @@ Here are the integrations:
 
 ### Prerequisites
 
-- Setup vLLM and langchain environment
+Set up the vLLM and langchain environment:
 
 ```bash
 pip install -U vllm \
@@ -22,33 +22,33 @@ pip install -U vllm \
 
 ### Deploy
 
-- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.
 
-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```
 
-- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
 
-- Run the script
+1. Run the script
 
-```python
-python retrieval_augmented_generation_with_langchain.py
-```
+    ```python
+    python retrieval_augmented_generation_with_langchain.py
+    ```
 
 ## vLLM + llamaindex
 
 ### Prerequisites
 
-- Setup vLLM and llamaindex environment
+Set up the vLLM and llamaindex environment:
 
 ```bash
 pip install vllm \
@@ -60,24 +60,24 @@ pip install vllm \
 
 ### Deploy
 
-- Start the vLLM server with the supported embedding model, e.g.
+1. Start the vLLM server with the supported embedding model, e.g.
 
-```bash
-# Start embedding service (port 8000)
-vllm serve ssmits/Qwen2-7B-Instruct-embed-base
-```
+    ```bash
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    ```
 
-- Start the vLLM server with the supported chat completion model, e.g.
+1. Start the vLLM server with the supported chat completion model, e.g.
 
-```bash
-# Start chat service (port 8001)
-vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
-```
+    ```bash
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+    ```
 
-- Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
+1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
 
-- Run the script
+1. Run the script:
 
-```python
-python retrieval_augmented_generation_with_llamaindex.py
-```
+    ```python
+    python retrieval_augmented_generation_with_llamaindex.py
+    ```
diff --git a/docs/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md
index 28031f01f85e8346f59603ab8d51ebb7a843aa1e..8eb7f8d81275d2ea700bd58585ce7def8cd80d21 100644
--- a/docs/deployment/integrations/llamastack.md
+++ b/docs/deployment/integrations/llamastack.md
@@ -1,6 +1,6 @@
 # Llama Stack
 
-vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
+vLLM is also available via [Llama Stack](https://github.com/llamastack/llama-stack).
 
 To install Llama Stack, run
 
@@ -8,9 +8,9 @@ To install Llama Stack, run
 pip install llama-stack -q
 ```
 
-## Inference using OpenAI Compatible API
+## Inference using OpenAI-Compatible API
 
-Then start Llama Stack server pointing to your vLLM server with the following configuration:
+Then start the Llama Stack server and configure it to point to your vLLM server with the following settings:
 
 ```yaml
 inference:
@@ -20,15 +20,15 @@ inference:
       url: http://127.0.0.1:8000
 ```
 
-Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider.
+Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/providers/inference/remote_vllm.html) for more details on this remote vLLM provider.
 
-## Inference via Embedded vLLM
+## Inference using Embedded vLLM
 
-An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm)
+An [inline provider](https://github.com/llamastack/llama-stack/tree/main/llama_stack/providers/inline/inference)
 is also available. This is a sample of configuration using that method:
 
 ```yaml
-inference
+inference:
   - provider_type: vllm
     config:
       model: Llama3.1-8B-Instruct
diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md
index b03483d1c9b217e1fc67e353fc93849f2cdea8e8..cb2037b575e53271bf87c888a14ccd546b9baf97 100644
--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@@ -54,8 +54,8 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
 
 ### FusedMoEPrepareAndFinalize
 
-The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions.
-The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
+The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive`  and `finalize` functions.
+The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers.  Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts.  The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
 
 ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
 
@@ -146,6 +146,10 @@ This section describes the significance of the various functions exposed by the
 
 `FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked.
 
+`FusedMoEPrepareAndFinalize::has_prepare_no_receive()`: Indicates whether or not this subclass implements `prepare_no_receive`. Defaults to False.
+
+`FusedMoEPrepareAndFinalize::prepare_no_receive()`: The prepare_no_receive method implements the Quantization and All2All Dispatch. It does not wait for the result of the dispatch operation but instead returns a thunk that can be invoked to wait for the final results. Typically the Dispatch function from the relevant All2All Manager is invoked.
+
 `FusedMoEPrepareAndFinalize::finalize()`: Maybe perform TopK Weight Application and Reduction and All2All Combine. Typically the Combine function from the relevant All2AllManager is invoked.
 
 `FusedMoEPrepareAndFinalize::activation_format()`: Return `FusedMoEActivationFormat.BatchedExperts` if the output of the prepare method (i.e. the All2All dispatch) is Batched. Return `FusedMoEActivationFormat.Standard` otherwise.
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index 8e5d5249409c661febd86c7a9b33335796c948cf..e70ee4a076e54ad77d739331006841a0a9d81ff7 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -2,7 +2,7 @@
 
 IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
 
-When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggerd via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
+When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
 
 ## Writing an IO Processor Plugin
 
@@ -64,9 +64,9 @@ The `parse_request` method is used for validating the user prompt and converting
 The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
 The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
 
-The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is [here](../../vllm/entrypoints/openai/serving_pooling_with_io_plugin.py).
+The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
 
-An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our [online](../../examples/online_serving/prithvi_geospatial_mae.py) and [offline](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py) inference examples.
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
 
 ## Using an IO Processor plugin
 
diff --git a/docs/features/compatibility_matrix.md b/docs/features/README.md
similarity index 98%
rename from docs/features/compatibility_matrix.md
rename to docs/features/README.md
index 5b08b3810776c0f888c2214f1f5a5b07dc5c5199..de23cd0a90eb57d0aa71dbab1abe06674f4a92db 100644
--- a/docs/features/compatibility_matrix.md
+++ b/docs/features/README.md
@@ -1,4 +1,6 @@
-# Compatibility Matrix
+# Features
+
+## Compatibility Matrix
 
 The tables below show mutually exclusive features and the support on some hardware.
 
@@ -12,7 +14,7 @@ The symbols used have the following meanings:
 !!! note
     Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.
 
-## Feature x Feature
+### Feature x Feature
 
 <style>
 td:not(:first-child) {
@@ -56,7 +58,7 @@ th:not(:first-child) {
 
 [](){ #feature-x-hardware }
 
-## Feature x Hardware
+### Feature x Hardware
 
 | Feature                                                   | Volta               | Turing    | Ampere    | Ada    | Hopper     | CPU                | AMD    | TPU |
 |-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index 206ab7a4687552cf1196986a31f448c6c35e25c7..77baa27c7a958c02a865ef2af769f2d48311beae 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
 
     ```python
     from vllm import LLM
-    
+
     # Default white background (no configuration needed)
     llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-    
+
     # Custom black background for dark theme
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
         media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
     )
-    
+
     # Custom brand color background (e.g., blue)
     llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf", 
+        model="llava-hf/llava-1.5-7b-hf",
         media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
     )
     ```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
 
 ## Online Serving
 
-Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests.
 
 !!! important
     A chat template is **required** to use Chat Completions API.
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
                 # NOTE: The prompt formatting with the image token `<image>` is not needed
                 # since the prompt will be processed automatically by the API server.
                 {"type": "text", "text": "What’s in this image?"},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        url": image_url
+                    },
+                    "uuid": image_url # Optional
+                },
             ],
         }],
     )
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
             "role": "user",
             "content": [
                 {"type": "text", "text": "What are the animals in these images?"},
-                {"type": "image_url", "image_url": {"url": image_url_duck}},
-                {"type": "image_url", "image_url": {"url": image_url_lion}},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_duck
+                    },
+                    "uuid": image_url_duck # Optional
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url_lion
+                    },
+                    "uuid": image_url_lion # Optional
+                },
             ],
         }],
     )
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
                     "video_url": {
                         "url": video_url
                     },
+                    "uuid": video_url # Optional
                 },
             ],
         }],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
                         "data": audio_base64,
                         "format": "wav"
                     },
+                    "uuid": audio_url # Optional
                 },
             ],
         }],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
                     "audio_url": {
                         "url": audio_url
                     },
+                    "uuid": audio_url # Optional
                 },
             ],
         }],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
     model = "llava-hf/llava-1.5-7b-hf"
     embeds =  {
         "type": "image_embeds",
-        "image_embeds": f"{base64_image_embedding}" 
+        "image_embeds": f"{base64_image_embedding}",
+        "uuid": image_url # Optional
     }
 
     # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
             "image_embeds": f"{base64_image_embedding}" , # Required
             "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
         },
+        "uuid": image_url # Optional
     }
     model = "openbmb/MiniCPM-V-2_6"
     embeds =  {
@@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
             "image_embeds": f"{base64_image_embedding}" , # Required
             "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
         },
+        "uuid": image_url # Optional
     }
     chat_completion = client.chat.completions.create(
         messages=[
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index d9a785eb73fbeda852c5266319f2e0914e3fbc07..d518e7f0cff43348f7bc936db115e2adb50b8549 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -15,6 +15,7 @@ vLLM currently supports the following reasoning models:
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
 | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
 
 !!! note
     IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index afc605a504b3df4e188b9058e3a2ce8327c51087..a8c0db0a7ac1386cd3f0598b51ac96c1e4f6c100 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.
 
 The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
 
-Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
+Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.
 
 Known issues:
 
@@ -311,6 +311,15 @@ Flags:
 * For non-reasoning: `--tool-call-parser hunyuan_a13b`
 * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
 
+### GLM-4.5 Models (`glm45`)
+
+Supported models:
+
+* `ZhipuAI/GLM-4.5`
+* `ZhipuAI/GLM-4.5-Air`
+
+Flags: `--tool-call-parser glm45`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index 7f0ecb2bc0b74201070d8e8b293266e96eb01a7c..f8b4f75308df7c347769b302b1782cc0c3840a64 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
     - Offline Inference: `256 * world_size`
     - Online Serving: `128 * world_size`
 
-vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
+vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
 
 ### Which quantization configs does vLLM CPU support?
 
@@ -194,3 +194,35 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
 - Both of them require `amx` CPU flag.
     - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
     - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
+
+### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
+
+In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
+
+To enable these optimizations inside Docker with the least privilege, you can follow below tips:
+
+```bash
+docker run ... --cap-add SYS_NICE --security-opt seccomp=unconfined  ...
+
+# 1) `--cap-add SYS_NICE` is to address `get_mempolicy` EPERM issue.
+
+# 2) `--security-opt seccomp=unconfined` is to enable `migrate_pages` for `numa_migrate_pages()`.
+# Actually, `seccomp=unconfined` bypasses the seccomp for container,
+# if it's unacceptable, you can customize your own seccomp profile,
+# based on docker/runtime default.json and add `migrate_pages` to `SCMP_ACT_ALLOW` list.
+
+# reference : https://docs.docker.com/engine/security/seccomp/
+```
+
+Alternatively, running with `--privileged=true` also works but is broader and not generally recommended.
+
+In K8S, the following configuration can be added to workload yaml to achieve the same effect as above:
+
+```yaml
+securityContext:
+  seccompProfile:
+    type: Unconfined
+  capabilities:
+    add:
+    - SYS_NICE
+```
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md
index cac578eefb1d75acc95ec30ab22002d5031fe257..e45baa0aa4938b0ceae6d8a63abc2e8c9df63e32 100644
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@@ -48,6 +48,10 @@ docker run --rm \
             --dtype=bfloat16 \
             other vLLM OpenAI server arguments
 ```
+
+!!! tip
+    An alternative of `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
index 57a09e674a821166398c217a62bd12d8358bb424..4bd4d39a6f80b669569a3e57f9e1c9ae30ccdd09 100644
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -16,8 +16,8 @@ cd vllm_source
 Third, install required dependencies:
 
 ```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend auto
-uv pip install -r requirements/cpu.txt --torch-backend auto
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
 ```
 
 ??? console "pip"
diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md
index c1917267ce91b03118508841f6b3b1e5e3916cc4..f9c4ccb942fac3c4089f4ec5dadfcb23f786d17a 100644
--- a/docs/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@@ -89,6 +89,9 @@ docker run --rm \
     other vLLM OpenAI server arguments
 ```
 
+!!! tip
+    An alternative of `--privileged true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
+
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
 # --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md
index f7af259ace62806a37ba2731250cab4da6bc7315..836da33f653177c032bc986439c1ee2bb788211a 100644
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@@ -44,6 +44,7 @@ docker build -f docker/Dockerfile.cpu \
 # Launching OpenAI server
 docker run --rm \
             --security-opt seccomp=unconfined \
+            --cap-add SYS_NICE \
             --shm-size=4g \
             -p 8000:8000 \
             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 275232e12e08c266ef66851ae2d7c8b76cdc0e10..01c5f5fc02f3e1ba074286d3a1828be4c7b00137 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -165,14 +165,14 @@ There are scenarios where the PyTorch dependency cannot be easily installed with
 - Building vLLM with PyTorch nightly or a custom PyTorch build.
 - Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it.
 
-To build vLLM using an existing PyTorch installation:
+To build vLLM using an existing PyTorch installation, it is recommended to use `uv`, because it has [a unique mechanism](https://docs.astral.sh/uv/concepts/projects/config/#disabling-build-isolation) for disabling build isolation for specific packages and vLLM leverages this mechanism to specify `torch` as the package to disable build isolation.
 
 ```bash
+# install PyTorch first, either from PyPI or from source
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-python use_existing_torch.py
-uv pip install -r requirements/build.txt
-uv pip install --no-build-isolation -e .
+# pip install -e . does not work directly, only uv can do this
+uv pip install -e .
 ```
 
 ##### Use the local cutlass for compilation
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 80e99d3034d39fe3c65cb46baed903cc4a725f28..4c70128d0b49a1ca01bef9cf3d2c4727e5e7cb69 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
     This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
     !!! tip
-        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
         - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
         - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
         - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md
index b77c4e00cf0c4798145640e984d15fe34397781b..ed1dc0418cf7e5f19ed5ed6444c0f08282010f8a 100644
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@@ -3,13 +3,16 @@
 vLLM initially supports basic model inference and serving on Intel GPU platform.
 
 !!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+    There are no pre-built wheels for this device, so you need build vLLM from source. Or you can use pre-built images which are based on vLLM released versions.
 
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
-- OneAPI requirements: oneAPI 2025.0
+- OneAPI requirements: oneAPI 2025.1
+- Python: 3.12
+!!! warning
+    The provided IPEX whl is Python3.12 specific so this version is a MUST.
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -24,7 +27,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
 - Second, install Python packages for vLLM XPU backend building:
 
 ```bash
@@ -40,14 +43,10 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-!!! note
-    - FP16 is the default data type in the current XPU backend. The BF16 data
-      type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
-
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
-Currently, there are no pre-built XPU images.
+Currently, we release prebuilt XPU images at docker [hub](https://hub.docker.com/r/intel/vllm/tags) based on vLLM released version. For more information, please refer release [note](https://github.com/intel/ai-containers/blob/main/vllm).
 
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
@@ -65,14 +64,14 @@ docker run -it \
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]
 
-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on single node with mp as the backend. For example, a reference execution like following:
 
 ```bash
 python -m vllm.entrypoints.openai.api_server \
      --model=facebook/opt-13b \
      --dtype=bfloat16 \
      --max_model_len=1024 \
-     --distributed-executor-backend=ray \
+     --distributed-executor-backend=mp \
      --pipeline-parallel-size=2 \
      -tp=8
 ```
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 051a2d904406da2b3b1dda3c084d5a460beb6b25..91454ec272b81fa5dae60a8c34e12cf58aa75f06 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -165,6 +165,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
     # Generate documentation for each parser
     for stem, parser in parsers.items():
         doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
-        with open(doc_path, "w") as f:
+        # Specify encoding for building on Windows
+        with open(doc_path, "w", encoding="utf-8") as f:
             f.write(parser.format_help())
         logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
index 881df791698e26b2ebb068fa2d2672160099769c..0cbaebb598a346b54da799682df68c9648db5f9e 100644
--- a/docs/mkdocs/hooks/generate_examples.py
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -106,13 +106,41 @@ class Example:
 
     def determine_title(self) -> str:
         if not self.is_code:
-            with open(self.main_file) as f:
+            # Specify encoding for building on Windows
+            with open(self.main_file, encoding="utf-8") as f:
                 first_line = f.readline().strip()
             match = re.match(r'^#\s+(?P<title>.+)$', first_line)
             if match:
                 return match.group('title')
         return fix_case(self.path.stem.replace("_", " ").title())
 
+    def fix_relative_links(self, content: str) -> str:
+        """
+        Fix relative links in markdown content by converting them to gh-file
+        format.
+        
+        Args:
+            content (str): The markdown content to process
+            
+        Returns:
+            str: Content with relative links converted to gh-file format
+        """
+        # Regex to match markdown links [text](relative_path)
+        # This matches links that don't start with http, https, ftp, or #
+        link_pattern = r'\[([^\]]*)\]\((?!(?:https?|ftp)://|#)([^)]+)\)'
+
+        def replace_link(match):
+            link_text = match.group(1)
+            relative_path = match.group(2)
+
+            # Make relative to repo root
+            gh_file = (self.main_file.parent / relative_path).resolve()
+            gh_file = gh_file.relative_to(ROOT_DIR)
+
+            return f'[{link_text}](gh-file:{gh_file})'
+
+        return re.sub(link_pattern, replace_link, content)
+
     def generate(self) -> str:
         content = f"# {self.title}\n\n"
         content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
@@ -120,14 +148,16 @@ class Example:
         # Use long code fence to avoid issues with
         # included files containing code fences too
         code_fence = "``````"
-        # Skip the title from md snippets as it's been included above
-        start_line = 2
-        if self.is_code:
-            content += f"{code_fence}{self.main_file.suffix[1:]}\n"
-            start_line = 1
-        content += f'--8<-- "{self.main_file}:{start_line}"\n'
+
         if self.is_code:
-            content += f"{code_fence}\n"
+            content += (f"{code_fence}{self.main_file.suffix[1:]}\n"
+                        f'--8<-- "{self.main_file}"\n'
+                        f"{code_fence}\n")
+        else:
+            with open(self.main_file) as f:
+                # Skip the title from md snippets as it's been included above
+                main_content = f.readlines()[1:]
+            content += self.fix_relative_links("".join(main_content))
         content += "\n"
 
         if not self.other_files:
@@ -174,6 +204,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         doc_path = EXAMPLE_DOC_DIR / example.category / example_name
         if not doc_path.parent.exists():
             doc_path.parent.mkdir(parents=True)
-        with open(doc_path, "w+") as f:
+        # Specify encoding for building on Windows
+        with open(doc_path, "w+", encoding="utf-8") as f:
             f.write(example.generate())
         logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index e8fe77e8d6c98423ece732ec2afac78c8e0ae21a..db3dd2c252c76dfe336c776304d5f3b8a06eb330 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -322,6 +322,7 @@ th {
 
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
+| `ApertusForCausalLM` | Apertus | `swiss-ai/Apertus-8B-2509`, `swiss-ai/Apertus-70B-Instruct-2509`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
@@ -365,8 +366,8 @@ th {
 | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
-| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
-| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
+| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ |
+| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
@@ -382,6 +383,7 @@ th {
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | |
 | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
@@ -395,12 +397,13 @@ th {
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
 | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
-| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | |
+| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3NextForCausalLM` | Qwen3NextMoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
@@ -440,6 +443,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
 | `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
 | `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
 | `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. |  |  | ✅︎ |
 | `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. |  |  | ✅︎ |
@@ -634,7 +638,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
+| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
+| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
 | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
@@ -642,6 +647,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
 | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
+| `MiDashengLMModel` | MiDashengLM | T + A<sup>+</sup> | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ |
 | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
 | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
@@ -660,7 +666,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
 | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ |
+| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ |
 | `RForConditionalGeneration` | R-VL-4B | T + I<sup>E+</sup> | `YannQi/R-4B` | | ✅︎ | ✅︎ |
 | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
 | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
@@ -760,8 +766,9 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
 |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
-| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
-| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | | ✅︎ | ✅︎ |
+| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | ✅︎ |
+| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
 
 ### Pooling Models
 
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index 280b3322b11c31168476f977414795fc6ad7aa4c..494d2ad021e713828dc54a1d4df264cf56f03e1a 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -123,18 +123,46 @@ When enabled, vLLM collects load statistics with every forward pass and periodic
 
 ### EPLB Parameters
 
+Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. The available keys and their descriptions are:
+
 | Parameter | Description | Default |
 |-----------|-------------|---------|
-| `--eplb-window-size` | Number of engine steps to track for rebalancing decisions | - |
-| `--eplb-step-interval` | Frequency of rebalancing (every N engine steps) | - |
-| `--eplb-log-balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
-| `--num-redundant-experts` | Additional global experts per EP rank beyond equal distribution | `0` |
+| `window_size`| Number of engine steps to track for rebalancing decisions | 1000 |
+| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
+| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
+| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
+
+For example:
+
+```bash
+vllm serve Qwen/Qwen3-30B-A3B \
+  --enable-eplb \
+  --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
+```
+
+??? tip "Prefer individual arguments instead of JSON?"
+
+    ```bash
+    vllm serve Qwen/Qwen3-30B-A3B \
+            --enable-eplb \
+            --eplb-config.window_size 1000 \
+            --eplb-config.step_interval 3000 \
+            --eplb-config.num_redundant_experts 2 \
+            --eplb-config.log_balancedness true
+    ```
 
 ### Expert Distribution Formula
 
 - **Default**: Each EP rank has `NUM_TOTAL_EXPERTS ÷ NUM_EP_RANKS` experts
 - **With redundancy**: Each EP rank has `(NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS` experts
 
+### Memory Footprint Overhead
+
+EPLB uses redundant experts that need to fit in GPU memory. This means that EPLB may not be a good fit for memory constrained environments or when KV cache space is at a premium.
+
+This overhead equals `NUM_MOE_LAYERS * BYTES_PER_EXPERT * (NUM_TOTAL_EXPERTS + NUM_REDUNDANT_EXPERTS) ÷ NUM_EP_RANKS`.
+For DeepSeekV3, this is approximately `2.4 GB` for one redundant expert per EP rank.
+
 ### Example Command
 
 Single node deployment with EPLB enabled:
@@ -146,12 +174,10 @@ VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V
     --data-parallel-size 8 \        # Data parallelism  
     --enable-expert-parallel \      # Enable EP
     --enable-eplb \                 # Enable load balancer
-    --eplb-log-balancedness \       # Log balancing metrics
-    --eplb-window-size 1000 \       # Track last 1000 engine steps
-    --eplb-step-interval 3000       # Rebalance every 3000 steps
+    --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
 ```
 
-For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--num-redundant-experts` to 32 in large scale use cases so the most popular experts are always available.
+For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.
 
 ## Disaggregated Serving (Prefill/Decode Split)
 
diff --git a/docs/serving/parallelism_scaling.md b/docs/serving/parallelism_scaling.md
index fa7fc1b290d505ef4e0c4961d7311dd8177799e3..cef1127fc5c15bf39983ed498c1c0630c015021c 100644
--- a/docs/serving/parallelism_scaling.md
+++ b/docs/serving/parallelism_scaling.md
@@ -66,7 +66,7 @@ Ray is a distributed computing framework for scaling Python programs. Multi-node
 
 vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.
 
-Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm/serving-llms.html) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.
+Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.
 
 For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html).
 
@@ -104,7 +104,7 @@ Note that `VLLM_HOST_IP` is unique for each worker. Keep the shells running thes
 From any node, enter a container and run `ray status` and `ray list nodes` to verify that Ray finds the expected number of nodes and GPUs.
 
 !!! tip
-    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/vllm-rayservice.html).
+    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/rayserve-llm-example.html).
 
 ### Running vLLM on a Ray cluster
 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md
index b92c6cef4a3fa27daa5d1332a90df170484af459..6e700d1faaa9c3e2f94f606a36f59d61c1bad329 100644
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -40,6 +40,34 @@ If other strategies don't solve the problem, it's likely that the vLLM instance
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
 - `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
 
+## Breakpoints
+
+Setting normal `pdb` breakpoints may not work in vLLM's codebase if they are executed in a subprocess. You will experience something like:
+
+``` text
+  File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 100, in trace_dispatch
+    return self.dispatch_line(frame)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/uv/cpython-3.12.11-linux-x86_64-gnu/lib/python3.12/bdb.py", line 125, in dispatch_line
+    if self.quitting: raise BdbQuit
+                      ^^^^^^^^^^^^^
+bdb.BdbQuit
+```
+
+One solution is using [forked-pdb](https://github.com/Lightning-AI/forked-pdb). Install with `pip install fpdb` and set a breakpoint with something like:
+
+``` python
+__import__('fpdb').ForkedPdb().set_trace()
+```
+
+Another option is to disable multiprocessing entirely, with the `VLLM_ENABLE_V1_MULTIPROCESSING` environment variable.
+This keeps the scheduler in the same process, so you can use stock `pdb` breakpoints:
+
+``` python
+import os
+os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+```
+
 ## Incorrect network setup
 
 The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and the IP address should be the correct one.
@@ -295,4 +323,5 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to
 ## Known Issues
 
 - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759).
-- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) .
+- To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations.
+- In some PCIe machines (e.g. machines without NVLink), if you see an error like `transport/shm.cc:590 NCCL WARN Cuda failure 217 'peer access is not supported between these two devices'`, it's likely caused by a driver bug. See [this issue](https://github.com/NVIDIA/nccl/issues/1838) for more details. In that case, you can try to set `NCCL_CUMEM_HOST_ENABLE=0` to disable the feature, or upgrade your driver to the latest version.
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index f71805436a6ae064cace4e5f3beae74ba0b9d8a9..d404c87e8f5a7d28b4ba64caa2149e49248511b9 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 | Model Type                  | Status                                                                             |
 |-----------------------------|------------------------------------------------------------------------------------|
 | **Decoder-only Models**     | <nobr>🚀 Optimized</nobr>                                                          |
-| **Encoder-Decoder Models**  | <nobr>🟠 Delayed</nobr>                                                            |
+| **Encoder-Decoder Models**  | <nobr>🟢 Whisper only</nobr>                                                       |
 | **Embedding Models**        | <nobr>🟢 Functional</nobr>                                                         |
 | **Mamba Models**            | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr>                                            |
 | **Multimodal Models**       | <nobr>🟢 Functional</nobr>                                                         |
@@ -110,7 +110,7 @@ Models using selective state-space mechanisms instead of standard transformer at
 Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.
 
 Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`, `Plamo2ForCausalLM`).
 
 Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).
 
@@ -118,8 +118,9 @@ Please note that prefix caching is not yet supported for any of the above models
 
 #### Encoder-Decoder Models
 
-Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)
-are not yet supported.
+Whisper is supported. Other models requiring cross-attention between separate
+encoder and decoder (e.g., `BartForConditionalGeneration`,
+`MllamaForConditionalGeneration`) are not yet supported.
 
 ### Features
 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 22cb8b057dac7692217c4b8470a1c142f33cf45a..65a87d2dd9e8ebb023bb81e6f824bae27619e4bf 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
 
 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
-    # NOTE - the setting in this example are somehat different than what is
+    # NOTE - the setting in this example are somewhat different from what is
     # optimal for granite speech, and it is generally recommended to use beam
     # search. Check the model README for suggested settings.
     # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
@@ -146,6 +146,36 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
     )
 
 
+# MiDashengLM
+def run_midashenglm(question: str, audio_count: int):
+    model_name = "mispeech/midashenglm-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    audio_in_prompt = "".join(
+        ["<|audio_bos|><|AUDIO|><|audio_eos|>" for idx in range(audio_count)]
+    )
+
+    default_system = "You are a helpful language and speech assistant."
+
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
+
+
 # MiniCPM-O
 def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
     model_name = "openbmb/MiniCPM-o-2_6"
@@ -352,6 +382,7 @@ model_example_map = {
     "voxtral": run_voxtral,
     "gemma3n": run_gemma3n,
     "granite_speech": run_granite_speech,
+    "midashenglm": run_midashenglm,
     "minicpmo": run_minicpmo,
     "phi4_mm": run_phi4mm,
     "phi4_multimodal": run_phi4_multimodal,
diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index 6e56e24f2092c984611d2e0129bb9cbfe41b78b5..3a95b1fdfbabc90fd924f889ec84f22643e3d337 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)
 
 print(outputs[0].outputs[0].text.strip())
 # yields
-#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
+#   'The weather in Dallas, TX is 85 degrees Fahrenheit. '
 #   'It is partly cloudly, with highs in the 90's.'
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index dd7559451c4c6a898d49a5829c398a955b54c78b..36d805a32db7a2d04e677eb1010136ef645a7116 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -87,6 +87,11 @@ def parse_args():
         default=0.8,
         help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
     )
+    parser.add_argument(
+        "--compilation-config",
+        type=int,
+        help=("Compilation optimization (O) level 0-3."),
+    )
     parser.add_argument(
         "--quantization",
         type=str,
@@ -106,6 +111,7 @@ def main(
     trust_remote_code,
     max_num_seqs,
     max_model_len,
+    compilation_config,
     gpu_memory_utilization,
     quantization,
 ):
@@ -162,6 +168,7 @@ def main(
         max_model_len=max_model_len,
         gpu_memory_utilization=gpu_memory_utilization,
         quantization=quantization,
+        compilation_config=compilation_config,
     )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
@@ -218,6 +225,7 @@ if __name__ == "__main__":
                 args.trust_remote_code,
                 args.max_num_seqs,
                 args.max_model_len,
+                args.compilation_config,
                 args.gpu_memory_utilization,
                 args.quantization,
             ),
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py
index 05a361fee071711912f8625b1365eb23c938b130..f619fa584f80134fb834beb605d6720cb39eaaef 100644
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
     ]
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 
-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
     # This instance is the prefill node (kv_producer, rank 0).
     # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
     ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
         kv_role="kv_producer",
         kv_rank=0,
         kv_parallel_size=2,
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
     ]
     sampling_params = SamplingParams(temperature=0, top_p=0.95)
 
-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
     # This instance is the decode node (kv_consumer, rank 1).
     # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
     ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
         kv_role="kv_consumer",
         kv_rank=1,
         kv_parallel_size=2,
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
index df6c1eaf4a21ecbb548557af2df7c71dcb6106e7..957db3c23b863af46d8bf42a0c9fe672e193e0cb 100644
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -5,6 +5,8 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART and mBART.
 
 This script is refactored to allow model selection via command-line arguments.
+
+NOTE: This example is not yet supported in V1.
 """
 
 import argparse
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 655f9f3fce7ae23c87938c0c6f681f7927f07ba9..35e9203d1caf00e9f4548c0c0076178ae0611c5f 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -5,6 +5,7 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 
+import os
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@@ -130,6 +131,8 @@ def run_mllama():
 
 
 def run_whisper():
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
     engine_args = EngineArgs(
         model="openai/whisper-large-v3-turbo",
         max_model_len=448,
diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor/custom.py
similarity index 100%
rename from examples/offline_inference/logits_processor.py
rename to examples/offline_inference/logits_processor/custom.py
diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c19bb4ce2bae2339ad1e64444b15cdecee6a8a5
--- /dev/null
+++ b/examples/offline_inference/logits_processor/custom_req.py
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""This example demonstrates wrapping a request-level logits processor to be
+compatible with vLLM's batch-level logits processing
+
+For demo purposes, a dummy logits processor is employed which, if
+`target_token` is passed as a keyword argument to `SamplingParams.extra_args`,
+will mask out all tokens except `target_token`. This logits processor can be
+applied to a vector of logits associated with a single decode step for a single
+request. The logits processor cannot be applied to a request which does not
+pass in a `target_token` custom argument.
+
+The request-level dummy logits processor is wrapped to create a batch-level
+logits processor, which can apply the logits processor to output logits from
+all requests in the persistent batch in a given decode step. For requests which
+do not provide a `target_token` argument, the corresponding row of `logits`
+will not be modified.
+
+A batch is constructed with `temperature=0.0` and 50% of requests specifying
+`target_token`, and for these requests - and *only* these requests - we
+expect the `target_token` to be decoded in each step, yielding an output
+similar to that shown below:
+
+Generated Outputs:
+------------------------------------------------------------
+Prompt:    'Hello, my name is'
+Output:    " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
+------------------------------------------------------------
+Prompt:    'The president of the United States is'
+Output:    " not a racist. He is a racist.\nHe's a racist because he"
+------------------------------------------------------------
+Prompt:    'The capital of France is'
+Output:    ' also also also also also also also also also also also also also
+             also also also'
+------------------------------------------------------------
+Prompt:    'The future of AI is'
+Output:    ' in the hands of the people.\n\nThe future of AI is in the'
+------------------------------------------------------------
+"""
+
+from typing import Any, Optional
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.logger import init_logger
+from vllm.v1.sample.logits_processor import (
+    AdapterLogitsProcessor,
+    RequestLogitsProcessor,
+)
+
+logger = init_logger(__name__)
+
+
+class DummyPerReqLogitsProcessor:
+    """The request-level logits processor masks out all logits except the
+    token id identified by `target_token`"""
+
+    def __init__(self, target_token: int) -> None:
+        """Specify `target_token`"""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        val_to_keep = logits[self.target_token].item()
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of wrapping a fake request-level logit processor to create a
+    batch-level logits processor"""
+
+    def is_argmax_invariant(self) -> bool:
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value.
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+            "target_token"
+        )
+        if target_token is None:
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.",
+                target_token,
+            )
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a mixture of requests which do and don't utilize the dummy logitproc
+sampling_params_list = [
+    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
+    SamplingParams(temperature=0.0),
+    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
+    SamplingParams(temperature=0.0),
+]
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="facebook/opt-125m",
+        logits_processors=[WrappedPerReqLogitsProcessor],
+    )
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params_list)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..62947d122e01cba9dc9377540f9da8be9c5900a5
--- /dev/null
+++ b/examples/offline_inference/logits_processor/custom_req_init.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""This example demonstrates a special case of wrapping a request-level logits
+processor, namely the case where it is necessary to utilize engine config or
+environment info passed to the constructor. The subclass must override the
+wrapper base class `__init__()` method to access the engine config, the device
+identifier, or the flag which indicates whether pinned memory is available.
+
+For demo purposes, a request-level dummy logits processor is employed which
+causes the same token (`target_token`) to be decoded in each step. The
+request-level dummy logits processor is wrapped to create a batch-level logits
+processor, which can apply the logits processor to output logits from all
+requests in the persistent batch in a given decode step.
+
+The wrapped dummy logits processor below models a scenario where we must
+disable the logits processor on non-"cuda" platforms. The wrapper base class
+`__init__()` is overridden in order to check this condition and set a flag.
+
+A batch is constructed with `temperature=0.0` and 50% of requests specifying
+`target_token`, and for these requests - and *only* these requests - we
+expect that on a "cuda" device the output will look something like:
+
+Generated Outputs:
+------------------------------------------------------------
+Prompt:    'Hello, my name is'
+Output:    " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
+------------------------------------------------------------
+Prompt:    'The president of the United States is'
+Output:    " not a racist. He is a racist.\nHe's a racist because he"
+------------------------------------------------------------
+Prompt:    'The capital of France is'
+Output:    ' also also also also also also also also also also also also also
+             also also also'
+------------------------------------------------------------
+Prompt:    'The future of AI is'
+Output:    ' in the hands of the people.\n\nThe future of AI is in the'
+------------------------------------------------------------
+
+which indicates that the logits processor is running. However, on a non-"cuda"
+device, the first and third requests would not repeat the same token.
+"""
+
+from typing import Optional
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.sample.logits_processor import (
+    AdapterLogitsProcessor,
+    RequestLogitsProcessor,
+)
+
+logger = init_logger(__name__)
+
+
+class DummyPerReqLogitsProcessor:
+    """The request-level logits processor masks out all logits except the
+    token id identified by `target_token`"""
+
+    def __init__(self, target_token: int) -> None:
+        """Specify `target_token`"""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        val_to_keep = logits[self.target_token].item()
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of overriding the wrapper class `__init__()` in order to utilize
+    info about the device type"""
+
+    def __init__(
+        self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
+    ):
+        super().__init__(vllm_config, device, is_pin_memory)
+        self.is_cuda = device.type == "cuda"
+
+    def is_argmax_invariant(self) -> bool:
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value, and the device
+        must be "cuda"-type
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        if (
+            not self.is_cuda
+            or (
+                target_token := params.extra_args
+                and params.extra_args.get("target_token")
+            )
+            is None
+        ):
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.",
+                target_token,
+            )
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a mixture of requests which do and don't utilize the dummy logitproc
+sampling_params_list = [
+    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
+    SamplingParams(temperature=0.0),
+    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
+    SamplingParams(temperature=0.0),
+]
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="facebook/opt-125m",
+        logits_processors=[WrappedPerReqLogitsProcessor],
+    )
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params_list)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index f0c00bcaaeb110c4a127f14ccc0b7f69eafeec8a..6040683c68bcd8d1a5281f63946325dc794d23c7 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -23,7 +23,7 @@ def create_test_prompts(
     2 requests for base model, 4 requests for the LoRA. We define 2
     different LoRA adapters (using the same model for demo purposes).
     Since we also set `max_loras=1`, the expectation is that the requests
-    with the second LoRA adapter will be ran after all requests with the
+    with the second LoRA adapter will be run after all requests with the
     first adapter have finished.
     """
     return [
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
deleted file mode 100644
index 7826629a36d01dc7990e99c4efa3b81a6b3d3ddf..0000000000000000000000000000000000000000
--- a/examples/offline_inference/neuron.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=1024,
-        block_size=1024,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py
deleted file mode 100644
index 8b1d235ff974289b85fbebcd3d89428b3a5d3e5b..0000000000000000000000000000000000000000
--- a/examples/offline_inference/neuron_eagle.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with an EAGLE speculative
-decoding model on neuron. To use EAGLE speculative decoding, you must use
-a draft model that is specifically fine-tuned for EAGLE speculation.
-Additionally, to use EAGLE with NxD Inference, the draft model must include
-the LM head weights from the target model. These weights are shared between
-the draft and target model.
-"""
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "What is annapurna labs?",
-]
-
-
-def main():
-    # Create a sampling params object.
-    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-    # Create an LLM.
-    llm = LLM(
-        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-        speculative_config={
-            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-            "num_speculative_tokens": 5,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in neuronx-distributed-inference.
-        max_model_len=2048,
-        block_size=2048,
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-        },
-    )
-
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
deleted file mode 100644
index c0ecfac508996974d2dd7c2344a044e17cda8be2..0000000000000000000000000000000000000000
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# creates XLA hlo graphs for all the context length buckets.
-os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-# Quantizes neuron model weight to int8 ,
-# The default config for quantization is int8 dtype.
-os.environ["NEURON_QUANT_DTYPE"] = "s8"
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=2048,
-        block_size=2048,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        quantization="neuron_quant",
-        override_neuron_config={
-            "cast_logits_dtype": "bfloat16",
-        },
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py
deleted file mode 100644
index 26f7505f2fa53b4b9d0a677447972a4ab5f4d554..0000000000000000000000000000000000000000
--- a/examples/offline_inference/neuron_multimodal.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import requests
-import torch
-from neuronx_distributed_inference.models.mllama.utils import add_instruct
-from PIL import Image
-
-from vllm import LLM, SamplingParams, TextPrompt
-
-
-def get_image(image_url):
-    image = Image.open(requests.get(image_url, stream=True).raw)
-    return image
-
-
-# Model Inputs
-PROMPTS = [
-    "What is in this image? Tell me a story",
-    "What is the recipe of mayonnaise in two sentences?",
-    "Describe this image",
-    "What is the capital of Italy famous for?",
-]
-IMAGES = [
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-]
-SAMPLING_PARAMS = [
-    dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
-    for _ in range(len(PROMPTS))
-]
-
-
-def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
-    # Prepare all inputs for mllama generation, including:
-    # 1. put text prompt into instruct chat template
-    # 2. compose single text and single image prompt into Vllm's prompt class
-    # 3. prepare sampling parameters
-    input_image = single_image
-    has_image = torch.tensor([1])
-    if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
-        has_image = torch.tensor([0])
-
-    instruct_prompt = add_instruct(prompt, has_image)
-    inputs = TextPrompt(prompt=instruct_prompt)
-
-    if input_image is not None:
-        inputs["multi_modal_data"] = {"image": input_image}
-
-    sampling_params = SamplingParams(**sampling_params)
-    return inputs, sampling_params
-
-
-def print_outputs(outputs):
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    assert (
-        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
-    ), f"""Text, image prompts and sampling parameters should have the 
-            same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, 
-            and {len(SAMPLING_PARAMS)}"""
-
-    # Create an LLM.
-    llm = LLM(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        max_num_seqs=1,
-        max_model_len=4096,
-        block_size=4096,
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "sequence_parallel_enabled": False,
-            "skip_warmup": True,
-            "save_sharded_checkpoint": True,
-            "on_device_sampling_config": {
-                "global_topk": 1,
-                "dynamic": False,
-                "deterministic": False,
-            },
-        },
-    )
-
-    batched_inputs = []
-    batched_sample_params = []
-    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
-        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
-        # test batch-size = 1
-        outputs = llm.generate(inputs, sampling_params)
-        print_outputs(outputs)
-        batched_inputs.append(inputs)
-        batched_sample_params.append(sampling_params)
-
-    # test batch-size = 4
-    outputs = llm.generate(batched_inputs, batched_sample_params)
-    print_outputs(outputs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py
deleted file mode 100644
index 7fc22caee742df6b4c86e32c7921d7ea71a5a29a..0000000000000000000000000000000000000000
--- a/examples/offline_inference/neuron_speculation.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with a speculative
-decoding model on neuron.
-"""
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, I am a language model and I can help",
-    "The president of the United States is",
-    "The capital of France is",
-]
-
-
-def config_buckets():
-    """Configure context length and token gen buckets."""
-    # creates XLA hlo graphs for all the context length buckets.
-    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-    # creates XLA hlo graphs for all the token gen buckets.
-    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-
-
-def initialize_llm():
-    """Create an LLM with speculative decoding."""
-    return LLM(
-        model="openlm-research/open_llama_7b",
-        speculative_config={
-            "model": "openlm-research/open_llama_3b",
-            "num_speculative_tokens": 4,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        max_model_len=2048,
-        block_size=2048,
-        device="neuron",
-        tensor_parallel_size=32,
-    )
-
-
-def process_requests(llm: LLM, sampling_params: SamplingParams):
-    """Generate texts from prompts and print them."""
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    """Main function that sets up the llm and processes prompts."""
-    config_buckets()
-    llm = initialize_llm()
-    # Create a sampling params object.
-    sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(llm, sampling_params)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index b6007b9f463019fc4982b5baeb0445957df8e158..1a5879a6d35f54a84721063bf5e09e359ee04432 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -45,7 +45,11 @@ datamodule_config = {
 class PrithviMAE:
     def __init__(self, model):
         self.model = LLM(
-            model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True
+            model=model,
+            skip_tokenizer_init=True,
+            dtype="float16",
+            enforce_eager=True,
+            model_impl="terratorch",
         )
 
     def run(self, input_data, location_coords):
diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
index 8023cd6677762762b72e781894d989f51fcbd6fc..418c40645f9f262e1f3ecdcaa6726ffbb6786c3d 100644
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -12,13 +12,13 @@ from vllm.pooling_params import PoolingParams
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirement - install plugin at:
+# Requirement - install plugin at:
 #   https://github.com/christian-pinto/prithvi_io_processor_plugin
 
 
 def main():
     torch.set_default_dtype(torch.float16)
-    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
 
     img_prompt = dict(
         data=image_url,
@@ -36,7 +36,8 @@ def main():
         # to avoid the model going OOM.
         # The maximum number depends on the available GPU memory
         max_num_seqs=32,
-        io_processor_plugin="prithvi_to_tiff_india",
+        io_processor_plugin="prithvi_to_tiff",
+        model_impl="terratorch",
     )
 
     pooling_params = PoolingParams(task="encode", softmax=False)
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index 65621023ab6ce06694058356f2739998a9cf0582..360fd79b55aad0e3c493beef8350bd4b59d7f665 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -28,12 +28,15 @@ Learn more about Ray placement groups:
 https://docs.ray.io/en/latest/placement-groups.html
 """
 
+import gc
 import os
 
 import ray
 import torch
+import zmq
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from torch.multiprocessing.reductions import reduce_tensor
 
 from vllm import LLM
 
@@ -86,20 +89,72 @@ class RayTrainingActor:
         from vllm.platforms import current_platform
 
         self.device_uuid = current_platform.get_device_uuid(0)
+        self.zmq_context = zmq.Context()
+        self.zmq_address_counter = 0
+        self.zmq_handle = None
 
     def report_device_id(self) -> str:
         return self.device_uuid
 
-    def get_weight_ipc_handles(self):
-        from torch.multiprocessing.reductions import reduce_tensor
+    def get_zmq_handles(self) -> dict[str, str]:
+        suffix = f"{self.device_uuid}-{self.zmq_address_counter}"
+        self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock"
+        self.zmq_address_counter += 1
+        return {self.device_uuid: self.zmq_handle}
 
-        data = {}
-        for name, p in self.model.named_parameters():
-            # A training actor might hold only a subset of the weights and may
-            # need to gather weights from other actors. For demonstration
-            # purposes, each training actor owns the full weight set.
-            data[name] = reduce_tensor(p.detach())
-        return {self.device_uuid: data}
+    def update_weights(self):
+        # align size to avoid misaligned address
+        align_size = 256
+
+        def get_size(p: torch.Tensor) -> int:
+            return (p.nbytes + align_size - 1) // align_size * align_size
+
+        named_parameters: dict[str, torch.nn.Parameter] = dict(
+            self.model.named_parameters()
+        )
+        max_tensor_size = max(get_size(p) for p in named_parameters.values())
+        # use max_tensor_size * 2 as buffer size
+        buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0")
+        s = self.zmq_context.socket(zmq.REQ)
+        s.bind(self.zmq_handle)
+        handle = reduce_tensor(buffer)
+
+        offset = 0
+        buckets: list[tuple[list[dict], list[torch.Tensor]]] = []
+        named_tensors: list[dict] = []
+        real_tensors: list[torch.Tensor] = []
+        for name, p in named_parameters.items():
+            size = get_size(p)
+            if offset + size > buffer.numel():
+                buckets.append((named_tensors, real_tensors))
+                named_tensors, real_tensors = [], []
+                offset = 0
+            # assume tensors are contiguous
+            named_tensors.append(
+                {"name": name, "dtype": p.dtype, "shape": p.shape, "offset": offset}
+            )
+            real_tensors.append(p)
+            offset += size
+        if named_tensors:
+            buckets.append((named_tensors, real_tensors))
+        s.send_pyobj(handle)
+        s.recv()
+        for named_tensors, real_tensors in buckets:
+            offset = 0
+            for p in real_tensors:
+                buffer[offset : offset + p.nbytes].data.copy_(
+                    p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
+                )
+                offset += get_size(p)
+            torch.cuda.synchronize()
+            s.send_pyobj(named_tensors)
+            s.recv()
+        s.send_pyobj(None)
+        s.recv()
+        s.close()
+        del buffer
+        gc.collect()
+        torch.cuda.empty_cache()
 
 
 # Ray manages four GPUs.
@@ -175,18 +230,22 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
 # the second inference engine.
 assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
 
-print("Gather all the IPC handles from the training actors.")
-ipc_handles = {}
+print("Gather all the ZMQ handles from the training actors.")
+zmq_handles = {}
 for actor in training_actors:
-    ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
+    zmq_handles.update(ray.get(actor.get_zmq_handles.remote()))
+
+print(f"ZMQ handles: {zmq_handles}")
 
 print("Update the weights of the inference engines.")
-for llm in inference_engines:
-    ray.get(
-        llm.collective_rpc.remote(
-            "update_weights_from_ipc_handles", args=(ipc_handles,)
-        )
-    )
+ray.get(
+    [actor.update_weights.remote() for actor in training_actors]
+    + [
+        llm.collective_rpc.remote("update_weights_from_ipc", args=(zmq_handles,))
+        for llm in inference_engines
+    ]
+)
+
 print("Check if the weights are updated.")
 for llm in inference_engines:
     assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index d2a8419ffabcd8c9ef7c05d54d9ffd1559b71934..c0e60b9793407da33284334868fe4d2d6d744b4c 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+from typing import Callable, Optional, TypedDict
+
 import torch
+import zmq
 
 
 def stateless_init_process_group(master_address, master_port, rank, world_size, device):
@@ -66,6 +70,27 @@ class WorkerExtension:
         return weights_updated
 
 
+def rebuild_ipc(
+    handle: tuple[Callable, tuple], device_id: Optional[int] = None
+) -> torch.Tensor:
+    func, args = handle
+    list_args = list(args)
+    if device_id is not None:
+        # the key is to change device id to the current device id
+        # in case two processes have different CUDA_VISIBLE_DEVICES
+        list_args[6] = device_id
+    buffer = func(*list_args)
+    return buffer
+
+
+class FlattenedTensorMetadata(TypedDict):
+    name: str
+    shape: torch.Size
+    dtype: torch.dtype
+    # specify the start offset of this tensor in shared ipc_buffer tensor
+    offset: int
+
+
 class ColocateWorkerExtension:
     """
     The class for vLLM's worker to inherit from, in the colocate setting.
@@ -76,27 +101,62 @@ class ColocateWorkerExtension:
     should pass the full qualified name as `worker_extension_cls` argument.
     """
 
+    def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
+        from vllm.model_executor.model_loader.utils import process_weights_after_loading
+
+        assert self.device is not None
+        if not hasattr(self, "_zmq_ctx") or self._zmq_ctx is None:
+            self._zmq_ctx = zmq.Context()
+        socket = self._zmq_ctx.socket(zmq.REP)
+        socket.connect(zmq_handles[self.report_device_id()])
+        buffer: Optional[torch.Tensor] = None
+        while True:
+            payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
+                socket.recv_pyobj()
+            )
+            if payload is None:
+                # means the update is done
+                process_weights_after_loading(
+                    self.model_runner.model, self.model_config, self.device
+                )
+                torch.cuda.synchronize()
+                socket.send(b"")
+                break
+            if isinstance(payload, tuple):
+                # an ipc handle that vLLM can use `func, args = handle`
+                # and `func(*args)` to rebuild GPU tensor.
+                buffer = rebuild_ipc(payload, self.device.index)
+                assert buffer.dtype == torch.uint8
+                socket.send(b"")
+                continue
+            assert isinstance(payload, list)
+            assert buffer is not None
+            weights = []
+            for item in payload:
+                shape = item["shape"]
+                if isinstance(shape, (list, tuple)):
+                    shape = torch.Size(shape)
+                assert isinstance(shape, torch.Size)
+                dtype, offset = item["dtype"], item["offset"]
+                size = dtype.itemsize * shape.numel()
+                tensor = buffer[offset : offset + size].view(dtype=dtype).view(shape)
+                weights.append((item["name"], tensor))
+            self.model_runner.model.load_weights(weights=weights)
+            del weights
+            torch.cuda.synchronize()
+            socket.send(b"")
+
+        socket.close()
+        del buffer
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def report_device_id(self) -> str:
         from vllm.platforms import current_platform
 
         self.device_uuid = current_platform.get_device_uuid(self.device.index)
         return self.device_uuid
 
-    def update_weights_from_ipc_handles(self, ipc_handles):
-        handles = ipc_handles[self.device_uuid]
-        device_id = self.device.index
-        weights = []
-        for name, handle in handles.items():
-            func, args = handle
-            list_args = list(args)
-            # the key is to change device id to the current device id
-            # in case two processes have different CUDA_VISIBLE_DEVICES
-            list_args[6] = device_id
-            tensor = func(*list_args)
-            weights.append((name, tensor))
-        self.model_runner.model.load_weights(weights=weights)
-        torch.cuda.synchronize()
-
     def check_weights_changed(self):
         """
         Check if the weights are updated to 0.
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 4e879666f61d7cc98d986ab1ea429c7691f780d2..b104113b8821301dccbb964a48109f6ffc637926 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Keye-VL-1.5
+def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1.5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Kimi-VL
 def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1648,6 +1679,7 @@ model_example_map = {
     "interns1": run_interns1,
     "internvl_chat": run_internvl,
     "keye_vl": run_keye_vl,
+    "keye_vl1_5": run_keye_vl1_5,
     "kimi_vl": run_kimi_vl,
     "llama4": run_llama4,
     "llava": run_llava,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index d9242efa85470aaf8186d51a739f2471e0b59013..01c2905cf26d8803dbc264f97fc5fd4916fdc87b 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "moonshotai/Kimi-VL-A3B-Instruct"
 
@@ -1209,6 +1246,7 @@ model_example_map = {
     "interns1": load_interns1,
     "internvl_chat": load_internvl,
     "keye_vl": load_keye_vl,
+    "keye_vl1_5": load_keye_vl1_5,
     "kimi_vl": load_kimi_vl,
     "llama4": load_llama4,
     "llava": load_llava,
diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh
index 6925dc8af07e9db004af6dca5e404a7bb6cde094..d434e22b1ae8868e9f61793211e828626788db43 100644
--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
     --gpu-memory-utilization 0.8 \
     --trust-remote-code \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
 
 # decoding instance, which is the KV consumer
 CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
     --gpu-memory-utilization 0.8 \
     --trust-remote-code \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
 
 # wait until prefill and decode instances are ready
 wait_for_server 8100
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index f238c66234dcca620059806461e1e33d56a1aad5..9fd55fc9ddc94082fc6de288ed6eb3189181cf5c 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -6,6 +6,8 @@ import msgspec
 import zmq
 from msgspec.msgpack import Decoder
 
+from vllm.v1.core.kv_cache_utils import BlockHash
+
 
 #
 # Types copied from vllm.distributed.kv_events
@@ -22,8 +24,8 @@ class KVCacheEvent(
 
 
 class BlockStored(KVCacheEvent):
-    block_hashes: list[int]
-    parent_block_hash: Optional[int]
+    block_hashes: list[BlockHash]
+    parent_block_hash: Optional[BlockHash]
     token_ids: list[int]
     block_size: int
     lora_id: Optional[int]
@@ -31,7 +33,7 @@ class BlockStored(KVCacheEvent):
 
 
 class BlockRemoved(KVCacheEvent):
-    block_hashes: list[int]
+    block_hashes: list[BlockHash]
     medium: Optional[str]
 
 
diff --git a/examples/online_serving/multi-node-serving.sh b/examples/online_serving/multi-node-serving.sh
index e8ad8d3de5f41750f88f364ed4e91cffb4d9cf0e..3fc5502fb9bc2457aca0120d01a6ef3a6451a32f 100644
--- a/examples/online_serving/multi-node-serving.sh
+++ b/examples/online_serving/multi-node-serving.sh
@@ -11,7 +11,7 @@
 # Example usage:
 # On the head node machine, start the Ray head node process and run a vLLM server.
 #   ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>]  && \
-#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#   vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
 # 
 # On each worker node, start the Ray worker node process.
 #   ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index ac5f79b56e49fa12c2b92a971ac6fb633a73acd7..37216a5cfe5741f672bfa02f8eed5c4d1e1c36c7 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
     print("Chat completion output from base64 encoded audio:", result)
 
 
+def run_multi_audio(model: str) -> None:
+    from vllm.assets.audio import AudioAsset
+
+    # Two different audios to showcase batched inference.
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    audio_url2 = AudioAsset("azacinto_foscolo").url
+    audio_base64_2 = encode_base64_content_from_url(audio_url2)
+
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Are these two audios the same?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                    },
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64_2,
+                            "format": "wav",
+                        },
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+
 example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "multi-audio": run_multi_audio,
     "video": run_video,
     "audio": run_audio,
 }
diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py
index cbd34f461362cdf012cf1950e7f7d36cd0d8eeb9..611a7cbc89fa2a98a998af3d6405fbbb37a0808f 100644
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@@ -10,18 +10,19 @@ import requests
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirements :
+# Requirements :
 # - install plugin at:
 #   https://github.com/christian-pinto/prithvi_io_processor_plugin
 # - start vllm in serving mode with the below args
 #   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
+#   --model-impl terratorch
 #   --task embed --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
-#   --io-processor-plugin prithvi_to_tiff_india
+#   --io-processor-plugin prithvi_to_tiff
 
 
 def main():
-    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
     server_endpoint = "http://localhost:8000/pooling"
 
     request_payload_url = {
@@ -33,6 +34,7 @@ def main():
         },
         "priority": 0,
         "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        "softmax": False,
     }
 
     ret = requests.post(server_endpoint, json=request_payload_url)
diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index 3488956a5b24cb8e1f8fd44875a04ecd8977cb15..37abc9de926fd1d7541e7ae0ffae4f9ef87e332f 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -402,7 +402,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "includeNullMetadata": false,
           "instant": false,
@@ -418,7 +418,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -435,7 +435,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -452,7 +452,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -468,7 +468,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
           "hide": false,
           "instant": false,
           "legendFormat": "Mean",
@@ -476,7 +476,7 @@
           "refId": "E"
         }
       ],
-      "title": "Time Per Output Token Latency",
+      "title": "Inter Token Latency",
       "type": "timeseries"
     },
     {
diff --git a/examples/tool_chat_template_phi4_mini.jinja b/examples/tool_chat_template_phi4_mini.jinja
index 83886762c2893e2e8bbdb8401c7a4f9abafc97a8..6f40c38c20644023965a0f5b731bd89112ab9ebc 100644
--- a/examples/tool_chat_template_phi4_mini.jinja
+++ b/examples/tool_chat_template_phi4_mini.jinja
@@ -9,7 +9,7 @@
 <|system|>
 {{ system_message }}
 {%- if tools %}
-In addition to plain text responses, you can chose to call one or more of the provided functions.
+In addition to plain text responses, you can choose to call one or more of the provided functions.
 
 Use the following rule to decide when to call a function:
   * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
@@ -19,7 +19,7 @@ If you decide to call functions:
   * prefix function calls with functools marker (no closing marker required)
   * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
   * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
-  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
   * make sure you pick the right functions that match the user intent
 
 
diff --git a/pyproject.toml b/pyproject.toml
index e63f8aeae2787c63a4ed05c97df16f54ff42a313..416423abcad88d0c9866ea8e91579ae5eb074287 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -228,6 +228,7 @@ fo = "fo"
 ba = "ba"
 
 [tool.typos.type.py.extend-words]
+ba = "ba"
 
 [tool.typos.type.cpp]
 extend-glob = ["*.cu"]
@@ -344,3 +345,6 @@ extend-ignore-re = []
 windo = "windo"
 
 [tool.typos.type.vimscript.extend-words]
+
+[tool.uv]
+no-build-isolation-package = ["torch"]
diff --git a/requirements/common.txt b/requirements/common.txt
index 9e7ccc3abae06354181fc6a1d34a79807a7cc8ff..43a5767fd5142ca66bc1376c939d2d5e6d928cfd 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -20,12 +20,11 @@ prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.11.3
 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
-outlines_core == 0.2.10 ; platform_machine != "s390x"
-outlines == 0.1.11 ; platform_machine == "s390x"
+outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
diff --git a/requirements/neuron.txt b/requirements/neuron.txt
deleted file mode 100644
index 7df478eddde3fffccea18182b8e29882c88812e7..0000000000000000000000000000000000000000
--- a/requirements/neuron.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# Common dependencies
--r common.txt
-
-# Dependencies for Neuron devices
-packaging>=24.2
-setuptools>=77.0.3,<80.0.0
-torch-neuronx >= 2.5.0
-neuronx-cc>=2.0.0a0
-torchvision # Required for Llama3.2 multimodal image preprocessing
diff --git a/requirements/rocm.txt b/requirements/rocm.txt
index e2ecd8e876d2a9c3ddedf50d0a89323bd523799f..1c586533a1677ebde8c2a6935082a9248f8298ad 100644
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 boto3
 botocore
 datasets
-ray>=2.10.0,<2.45.0
+ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
 peft
 pytest-asyncio
 tensorizer==2.10.1
diff --git a/requirements/test.in b/requirements/test.in
index 5b1688c76c954c3e04c56bb9c286dc2367252635..744cfbe885278a3b4ce197f7a59e1efaebef77fd 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
+tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
 torch==2.8.0
 torchaudio==2.8.0
@@ -53,5 +54,5 @@ runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
 pydantic>=2.10 # 2.9 leads to error on python 3.10
-terratorch==1.1rc2 # required for PrithviMAE test
 decord==0.6.0
+terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
diff --git a/requirements/test.txt b/requirements/test.txt
index 0b728ebfb007134f4ab08bdb699ff62ec400d3bb..5eebdc788aa3df5f4f188feca750139f55a106b0 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -137,7 +137,7 @@ contourpy==1.3.0
     # via matplotlib
 cramjam==2.9.0
     # via fastparquet
-cupy-cuda12x==13.3.0
+cupy-cuda12x==13.6.0
     # via ray
 cycler==0.12.1
     # via matplotlib
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
     # via pytablewriter
 tabulate==0.9.0
     # via sacrebleu
+tblib==3.1.0
+    # via -r requirements/test.in
 tcolorpy==0.1.6
     # via pytablewriter
 tenacity==9.0.0
@@ -1042,7 +1044,7 @@ tensorboardx==2.6.4
     # via lightning
 tensorizer==2.10.1
     # via -r requirements/test.in
-terratorch==1.1rc2
+terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
     # via -r requirements/test.in
 threadpoolctl==3.5.0
     # via scikit-learn
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 4607c3efdf14c4f5ebb452b09875ad2ac9541a76..74f5b05b2382ae993f0cc24fce4d6a9508833989 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -10,10 +10,10 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
---extra-index-url=https://download.pytorch.org/whl/xpu
+nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
-pytorch-triton-xpu
---extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.8.10+xpu
+--extra-index-url=https://download.pytorch.org/whl/xpu
+
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
diff --git a/setup.py b/setup.py
index 6eeb0134d109239c9cfb9ffe653ed246b4b0758a..38ebaff79199334b8e1c93c9550e281b8799f398 100644
--- a/setup.py
+++ b/setup.py
@@ -413,8 +413,7 @@ def _no_device() -> bool:
 
 def _is_cuda() -> bool:
     has_cuda = torch.version.cuda is not None
-    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
 
 
 def _is_hip() -> bool:
@@ -422,10 +421,6 @@ def _is_hip() -> bool:
             or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
 
 
-def _is_neuron() -> bool:
-    return VLLM_TARGET_DEVICE == "neuron"
-
-
 def _is_tpu() -> bool:
     return VLLM_TARGET_DEVICE == "tpu"
 
@@ -470,25 +465,6 @@ def get_rocm_version():
         return None
 
 
-def get_neuronxcc_version():
-    import sysconfig
-    site_dir = sysconfig.get_paths()["purelib"]
-    version_file = os.path.join(site_dir, "neuronxcc", "version",
-                                "__init__.py")
-
-    # Check if the command was executed successfully
-    with open(version_file) as fp:
-        content = fp.read()
-
-    # Extract the version using a regular expression
-    match = re.search(r"__version__ = '(\S+)'", content)
-    if match:
-        # Return the version string
-        return match.group(1)
-    else:
-        raise RuntimeError("Could not find Neuron version in the output")
-
-
 def get_nvcc_cuda_version() -> Version:
     """Get the CUDA version from nvcc.
 
@@ -541,12 +517,6 @@ def get_vllm_version() -> str:
         rocm_version = get_rocm_version() or torch.version.hip
         if rocm_version and rocm_version != MAIN_CUDA_VERSION:
             version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
-    elif _is_neuron():
-        # Get the Neuron version
-        neuron_version = str(get_neuronxcc_version())
-        if neuron_version != MAIN_CUDA_VERSION:
-            neuron_version_str = neuron_version.replace(".", "")[:3]
-            version += f"{sep}neuron{neuron_version_str}"
     elif _is_tpu():
         version += f"{sep}tpu"
     elif _is_cpu():
@@ -591,8 +561,6 @@ def get_requirements() -> list[str]:
         requirements = modified_requirements
     elif _is_hip():
         requirements = _read_requirements("rocm.txt")
-    elif _is_neuron():
-        requirements = _read_requirements("neuron.txt")
     elif _is_tpu():
         requirements = _read_requirements("tpu.txt")
     elif _is_cpu():
@@ -601,7 +569,7 @@ def get_requirements() -> list[str]:
         requirements = _read_requirements("xpu.txt")
     else:
         raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, or CPU.")
     return requirements
 
 
@@ -688,13 +656,15 @@ setup(
         "bench": ["pandas", "datasets"],
         "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai":
-        ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"],
+        "runai": [
+            "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
+            "google-cloud-storage", "runai-model-streamer-s3", "boto3"
+        ],
         "audio": ["librosa", "soundfile",
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.14.post1"],
+        "flashinfer": ["flashinfer-python==0.3.0"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 76c94bdf80ca8c13729abe3124013ff53c0f67b7..07370a880329150a751b38ff63795b2ec17337d0 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import copyreg
 import os
 import subprocess
 import sys
@@ -10,6 +11,30 @@ from pathlib import Path
 
 import pytest
 import requests
+import urllib3.exceptions
+
+
+def _pickle_new_connection_error(obj):
+    """Custom pickler for NewConnectionError to fix tblib compatibility."""
+    # Extract the original message by removing the "conn: " prefix
+    full_message = obj.args[0] if obj.args else ""
+    if ': ' in full_message:
+        # Split off the connection part and keep the actual message
+        _, actual_message = full_message.split(': ', 1)
+    else:
+        actual_message = full_message
+    return _unpickle_new_connection_error, (actual_message, )
+
+
+def _unpickle_new_connection_error(message):
+    """Custom unpickler for NewConnectionError."""
+    # Create with None as conn and the actual message
+    return urllib3.exceptions.NewConnectionError(None, message)
+
+
+# Register the custom pickle/unpickle functions for tblib compatibility
+copyreg.pickle(urllib3.exceptions.NewConnectionError,
+               _pickle_new_connection_error)
 
 
 def _query_server(prompt: str, max_tokens: int = 5) -> dict:
@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
     uvicorn_process.terminate()
 
 
+@pytest.mark.timeout(300)
 @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, distributed_executor_backend: str):
     """
@@ -98,7 +124,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
         pool.join()
 
         # check cancellation stats
-        # give it some times to update the stats
+        # give it some time to update the stats
         time.sleep(1)
 
         num_aborted_requests = requests.get(
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index bfcf274727e276093f0861a8dd8d5d136b95cea9..5471d6b8e4a5f399e034eef4e5d0e5ef751b2fb2 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -45,3 +45,34 @@ def test_bench_serve(server):
     print(result.stderr)
 
     assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+
+@pytest.mark.benchmark
+def test_bench_serve_chat(server):
+    command = [
+        "vllm",
+        "bench",
+        "serve",
+        "--model",
+        MODEL_NAME,
+        "--host",
+        server.host,
+        "--port",
+        str(server.port),
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "32",
+        "--random-output-len",
+        "4",
+        "--num-prompts",
+        "5",
+        "--endpoint",
+        "/v1/chat/completions",
+        "--endpoint-type",
+        "openai-chat",
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+
+    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index 97140a9db7af6a28ce5be578f826b55191844ebc..2454f85342eba81604e27856abb89273948aeba9 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -61,6 +61,16 @@ backend_configs = {
                       "cudagraph_mode": "FULL_AND_PIECEWISE",
                   },
                   specific_gpu_arch=(9, 0)),
+    # FlashAttention MLA on Hopper
+    "FlashAttentionMLA":
+    BackendConfig(name="FlashAttentionMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_DECODE_ONLY",
+                  },
+                  specific_gpu_arch=(9, 0)),
     # Cutlass MLA on Blackwell
     "CutlassMLA":
     BackendConfig(
@@ -102,7 +112,7 @@ backend_configs = {
 test_params_full_cudagraph = []
 
 # deepseek-ai/DeepSeek-V2-Lite with MLA
-MLA_backends = ["FlashMLA", "CutlassMLA"]
+MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
     test_params_full_cudagraph.append(
         pytest.param(
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index f5e2d9ddb7528d52cb15c2aa397a3695d46b3404..5cfebfce9ea2a8729a2c91205c44fa2337fefcc7 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -4,9 +4,9 @@
 Test (piecewise) compilation with a simple model where multiple submodules
 are compiled and graph captured separately.
 """
+
 import torch
 from torch import nn
-from torch.library import Library
 
 from vllm.compilation.backends import set_model_tag
 from vllm.compilation.counter import compilation_counter
@@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile,
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                          VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
 
-# create a library to hold the custom op
-silly_lib = Library("silly", "FRAGMENT")  # noqa
+# This import automatically registers `torch.ops.silly.attention`
+from .. import silly_attention  # noqa: F401
 
 BATCH_SIZE = 32
 MLP_SIZE = 128
@@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024
 RANDOM_SEED = 0
 
 
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-
-
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-
-
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
-
-
 @support_torch_compile
 class ParentModel(nn.Module):
 
@@ -134,7 +112,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
         # Test will fail without set_model_tag here with error:
         # "ValueError: too many values to unpack (expected 3)"
         # This is because CompiledAttention and CompiledAttentionTwo
-        # have different implmentations but the same torch.compile
+        # have different implementations but the same torch.compile
         # cache dir will be used as default prefix is 'model_tag'
         with set_model_tag("attn_one"):
             self.attn_one = CompiledAttention(
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index 2d1a72d44ec70bac7582132c8786e3f9671067eb..84f4945c827257563f4cf6e89374bd900991aeb3 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -4,10 +4,10 @@
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
 """
+
 import pytest
 import torch
 from torch import nn
-from torch.library import Library
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
@@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                          VllmConfig, set_current_vllm_config)
 from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
-
-global_counter = 0
-
-# create a library to hold the custom op
-silly_lib = Library("silly", "FRAGMENT")  # noqa
-
-
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    global global_counter
-    global_counter += 1
-    print(f"{global_counter=}")
-    out.copy_(q)
-    out[0] += 1
-
-
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-
 
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
+# This import automatically registers `torch.ops.silly.attention`
+from ..silly_attention import get_global_counter, reset_global_counter
 
 
 @support_torch_compile
@@ -59,8 +33,7 @@ class SillyModel(nn.Module):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Overall effect:
-        x += 1
-        x[0] += 2
+        x = 3 * x + 19
         global_counter += 2
         """
         x = x + 1
@@ -78,6 +51,7 @@ class SillyModel(nn.Module):
 
 
 @pytest.mark.parametrize("use_inductor", [True, False])
+@torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
     assert VLLM_USE_V1
 
@@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor):
             model(torch.randn(1).cuda())
 
         input = torch.zeros(2).cuda()
-        global global_counter
-        global_counter = 0
+        reset_global_counter()
         with set_forward_context(
                 None,
                 vllm_config=vllm_config,
                 cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
                 batch_descriptor=BatchDescriptor(num_tokens=2, )):
             output = model(input)
-        assert global_counter == 2
-        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+        assert get_global_counter() == 2
+        assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index bcfd0d834c5db6a876202ad64be2239f3b4b15e8..cba7517647e51294226bf291cf0990a04ed7ba16 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -14,38 +14,15 @@ from typing import Any, Optional
 import pytest
 import torch
 from torch import nn
-from torch.library import Library
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
                          VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
 
-# create a library to hold the custom op
-silly_lib = Library("silly", "FRAGMENT")  # noqa
-
-
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-
-
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-
-
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
+# This import automatically registers `torch.ops.silly.attention`
+from .. import silly_attention  # noqa: F401
 
 
 @dataclass
diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..13eb0bf4b1fa16b8c2167346ac0dbebfe1b4a671
--- /dev/null
+++ b/tests/compile/silly_attention.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shared PyTorch custom silly attention for compilation tests.
+Centralizes custom operation definitions to avoid duplicate registrations.
+"""
+
+import torch
+from torch.library import Library
+
+from vllm.utils import direct_register_custom_op
+
+# Shared library for all compilation test operations
+# Using "silly" namespace to match existing test expectations
+# import this file will automatically register
+# torch ops for testing (like silly.attention)
+silly_lib = Library("silly", "FRAGMENT")
+
+# Global counter that counts the number of times attention is invoked
+_global_counter = 0
+
+
+def get_global_counter():
+    """Get the current global counter value"""
+    return _global_counter
+
+
+def reset_global_counter():
+    """Reset the global counter to 0"""
+    global _global_counter
+    _global_counter = 0
+
+
+def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    out: torch.Tensor) -> None:
+    """
+    Unified attention implementation that depends on
+    all inputs and affects the output.
+    Always increments a global counter that tests can use or ignore.
+    """
+    global _global_counter
+
+    # Always increment the global counter
+    _global_counter += 1
+
+    # Unified implementation that depends on all inputs
+    out.copy_(q + k + v)
+
+
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
+    """Fake implementation for testing"""
+    return
+
+
+# Register the unified attention operation
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 422cb94b036caeb2e31da9097cc58e22e7f26a11..fd2b1866e62e1a8c32852eb53ab539bf75b8a28b 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -23,7 +23,7 @@ class TestSetting:
     fullgraph: bool
 
 
-# we cannot afford testing the full Catesian product
+# we cannot afford testing the full Cartesian product
 # of all models and all levels
 @pytest.mark.parametrize(
     "test_setting",
@@ -62,8 +62,12 @@ class TestSetting:
         TestSetting(
             model="BAAI/bge-multilingual-gemma2",
             model_args=[
-                "--runner", "pooling", "--dtype", "bfloat16",
-                "--max-model-len", "2048"
+                "--runner",
+                "pooling",
+                "--dtype",
+                "bfloat16",
+                "--max-model-len",
+                "2048",
             ],
             pp_size=1,
             tp_size=1,
@@ -71,17 +75,15 @@ class TestSetting:
             method="encode",
             fullgraph=True,
         ),
-        # TODO: bert models are not supported in V1 yet
-        # # encoder-based embedding model (BERT)
-        # TestSetting(
-        #     model="BAAI/bge-base-en-v1.5",
-        #     model_args=["--runner", "pooling"],
-        #     pp_size=1,
-        #     tp_size=1,
-        #     attn_backend="XFORMERS",
-        #     method="encode",
-        #     fullgraph=True,
-        # ),
+        TestSetting(
+            model="BAAI/bge-base-en-v1.5",
+            model_args=["--runner", "pooling"],
+            pp_size=1,
+            tp_size=1,
+            attn_backend="FLASH_ATTN",
+            method="encode",
+            fullgraph=True,
+        ),
         # vision language model
         TestSetting(
             model="microsoft/Phi-3.5-vision-instruct",
@@ -92,7 +94,8 @@ class TestSetting:
             method="generate_with_image",
             fullgraph=False,
         ),
-    ])
+    ],
+)
 def test_compile_correctness(
     monkeypatch: pytest.MonkeyPatch,
     test_setting: TestSetting,
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
index 51f8ddd566d5646d66027f268de45041ebb3d17e..d73586d53ff3ed44f1f549725feea6f558973eb5 100644
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from torch import nn
-from torch.library import Library
 
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import (ignore_torch_compile,
@@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile,
 from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
                          CUDAGraphMode, VllmConfig, set_current_vllm_config)
 from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.utils import direct_register_custom_op
 
-# create a library to hold the custom op
-silly_lib = Library("silly", "FRAGMENT")  # noqa
+# This import automatically registers `torch.ops.silly.attention`
+from . import silly_attention  # noqa: F401
 
 BATCH_SIZE = 32
 MLP_SIZE = 128
 
 
-def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                    out: torch.Tensor) -> None:
-    out.copy_(q)
-    out += k
-    out += v
-
-
-def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-                         out: torch.Tensor) -> None:
-    return
-
-
-direct_register_custom_op(
-    op_name="attention",
-    op_func=silly_attention,
-    mutates_args=["out"],
-    fake_impl=silly_attention_fake,
-    target_lib=silly_lib,
-)
-
-
 @torch.inference_mode
 def run_model(vllm_config: VllmConfig, model: nn.Module,
               cudagraph_runtime_mode: CUDAGraphMode):
@@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator():
         run_model(vllm_config, mod_C, cudagraph_runtime_mode)
 
 
-# Only enable torch.compile if
+# Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=True
 @support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
                        kv_sharing_fast_prefill)
@@ -173,7 +150,7 @@ class B(nn.Module):
         return x
 
 
-# Only enable torch.compile if
+# Only enable torch.compile if
 # vllm_config.cache_config.kv_sharing_fast_prefill=False
 @support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
                        cache_config.kv_sharing_fast_prefill)
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index c4229f93464ac93eb2799bdd4ba221d99846176d..eedb9bdcd5299518e5126aae488b79a8cf8b7525 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape, QuantKey, ScaleDesc)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, maybe_create_device_identity)
+    Fp8LinearOp, cutlass_fp8_supported, maybe_create_device_identity)
 from vllm.platforms import current_platform
 
+from ..utils import override_cutlass_fp8_supported
 from .backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
 class TestModel(torch.nn.Module):
 
     def __init__(self, hidden_size: int, eps: float, static: bool,
-                 force_fp8_e4m3fnuz: bool, *args, **kwargs):
+                 cuda_force_torch: bool, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
+        self.cuda_force_torch = cuda_force_torch
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
         self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
         group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
@@ -42,11 +43,12 @@ class TestModel(torch.nn.Module):
             torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
             for _ in range(2)
         ]
-        self.fp8_linear = Fp8LinearOp(
-            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
-            act_quant_static=static,
-            act_quant_group_shape=group_shape,
-        )
+
+        with override_cutlass_fp8_supported(not cuda_force_torch):
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=static,
+                act_quant_group_shape=group_shape,
+            )
 
     def forward(self, x):
         resid = torch.sqrt(x)
@@ -81,11 +83,14 @@ class TestModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
-@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
+# cuda_force_torch used to test torch code path on platforms that
+# cutlass_fp8_supported() == True.
+@pytest.mark.parametrize("cuda_force_torch",
+                         [True, False] if cutlass_fp8_supported() else [True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                     reason="Only test on CUDA and ROCm")
 def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
-                              force_fp8_e4m3fnuz):
+                              cuda_force_torch):
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
     torch.manual_seed(1)
@@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
         fusion_pass = FusionPass.instance(vllm_config)
 
         backend = TestBackend(noop_pass, fusion_pass)
-        model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
+        model = TestModel(hidden_size, eps, static, cuda_force_torch)
 
         # First dimension dynamic
         x = torch.rand(num_tokens, hidden_size)
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index dba668cfa16a60850bc75f92f20e166d3da01dc2..6baf4bf83f4996ba8e794727c243496214307681 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None
 @pytest.mark.parametrize(
     "model, quant_key",
     [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)])
-@pytest.mark.parametrize(
-    "use_triton_fa", [True, False] if current_platform.is_rocm() else [False])
+@pytest.mark.parametrize("use_triton_fa", [True, False])
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(not current_platform.is_cuda_alike(),
-                    reason="Only test CUDA and ROCm")
-def test_attention_fusion(example_prompts, monkeypatch, model: str,
-                          quant_key: QuantKey, use_triton_fa: bool):
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="V0 attn quant fusion only on ROCm")
+def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
+                             quant_key: QuantKey, use_triton_fa: bool):
     # Clean Dynamo cache to avoid reusing other test cases
     # (for some reason the reset at the end is not enough)
     torch._dynamo.reset()
@@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
         backend="tests.compile.test_fusion_attn.backend_unfused",
         custom_ops=["+quant_fp8"],
     )
-    vllm_config = VllmConfig(compilation_config=compile_config)
+    vllm_config = VllmConfig(compilation_config=compile_config,
+                             model_config=ModelConfig(
+                                 model=model,
+                                 dtype=torch.bfloat16,
+                             ))
     backend_unfused = TestBackend(NoOpEliminationPass(vllm_config))
 
     llm = LLM(model,
               enforce_eager=True,
               compilation_config=compile_config,
-              gpu_memory_utilization=0.9,
+              gpu_memory_utilization=0.5,
               max_model_len=2048)
 
     sampling_params = SamplingParams(temperature=0.0,
@@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
         backend="tests.compile.test_fusion_attn.backend",
         custom_ops=["+quant_fp8"],
     )
-    vllm_config = VllmConfig(compilation_config=compile_config)
+    vllm_config = VllmConfig(compilation_config=compile_config,
+                             model_config=ModelConfig(
+                                 model=model,
+                                 dtype=torch.bfloat16,
+                             ))
 
     # AttnFusionPass needs attention layers to be registered in config upon init
     # so we initialize it during compilation.
@@ -102,7 +109,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
     llm2 = LLM(model,
                enforce_eager=True,
                compilation_config=compile_config,
-               gpu_memory_utilization=0.9,
+               gpu_memory_utilization=0.5,
                max_model_len=2048)
 
     # check support
@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
             cache_config=vllm_config.cache_config,
             prefix="model.layers.0.self_attn.attn",
         )
+        self.attn._k_scale = self.attn._k_scale.to(device)
+        self.attn._v_scale = self.attn._v_scale.to(device)
 
         self.block_size = 16
 
@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
             device=self.device,
         )
 
-    def build_attn_metadata(self, batch_size: int):
+    def build_attn_metadata(self, batch_size: int, use_hnd: bool):
         """Initialize attention metadata."""
 
         # Create common attn metadata
@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
         num_blocks = batch_size * max_blocks
 
         # Create dummy KV cache for FlashInfer TRTLLM
-        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
-        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
-        # Create kv_cache in HND layout and permute to NHD layout
-        # (later will be permuted back to HND layout in forward pass)
+        #   - NHD: [num_blocks, block_size, num_kv_heads, head_size]
+        #   - HND: [num_blocks, num_kv_heads, block_size, head_size]
         kv_cache = torch.zeros(num_blocks,
                                2,
                                self.num_kv_heads,
@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
                                self.head_size,
                                dtype=self.kv_cache_dtype,
                                device=self.device)
-        kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+        if current_platform.is_rocm():
+            # k/v as 1st dimention
+            if use_hnd:
+                kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
+            else:
+                kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
+        else:
+            # k/v as 2nd dimention
+            # Create kv_cache in HND layout and permute to NHD layout
+            # (later will be permuted back to HND layout in forward pass)
+            kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
         self.attn.kv_cache = [kv_cache]
 
         # Build attn metadata
@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
                                      out_dtype=attn_output.dtype)
 
 
-@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
+if current_platform.is_cuda():
+    MODELS = [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+               TestAttentionFp8StaticQuantPatternModel),
+              ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+               TestAttentionNvfp4QuantPatternModel)]
+    HEADS = [(64, 8), (40, 8)]
+elif current_platform.is_rocm():
+    MODELS = [("amd/Llama-3.1-8B-Instruct-FP8-KV",
+               TestAttentionFp8StaticQuantPatternModel)]
+    HEADS = [(32, 8), (40, 8)]
+else:
+    MODELS = []
+    HEADS = []
+
+
+@pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
 @pytest.mark.parametrize("head_size", [128])
-@pytest.mark.parametrize("batch_size", [7, 256, 533])
-@pytest.mark.parametrize("dtype", [torch.bfloat16])
-@pytest.mark.parametrize("model_name, model_class",
-                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-                           TestAttentionFp8StaticQuantPatternModel),
-                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-                           TestAttentionNvfp4QuantPatternModel)])
-@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
-@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
+@pytest.mark.parametrize("batch_size",
+                         [7, 256, 533] if current_platform.is_cuda() else [8])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("model_name, model_class", MODELS)
+@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
+                         current_platform.is_cuda() else [_Backend.ROCM_FLASH])
+@pytest.mark.parametrize(
+    "split_attention",
+    [False, True] if current_platform.is_rocm() else [False])
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only test ROCm or CUDA")
 @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
-                    reason="Only test on SM100(Blackwell)")
+@pytest.mark.skipif(current_platform.is_cuda()
+                    and not current_platform.is_device_capability((10, 0)),
+                    reason="On CUDA only test on SM100(Blackwell)")
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only test ROCm or CUDA")
 def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                  head_size: int, batch_size: int,
                                  dtype: torch.dtype, model_name: str,
                                  model_class: type[AttentionQuantPatternModel],
-                                 backend: _Backend, monkeypatch, dist_init):
+                                 backend: _Backend, split_attention: bool,
+                                 monkeypatch, dist_init):
     """Test AttentionStaticQuantPattern fusion pass"""
 
     monkeypatch.setenv("VLLM_USE_V1", "1")
+    if split_attention:
+        monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
 
     device = torch.device("cuda:0")
     torch.manual_seed(42)
@@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         model_config=ModelConfig(
             model=model_name,
             max_model_len=2048,
+            dtype=dtype,
         ),
         scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
@@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
 
         forward_ctx = get_forward_context()
         forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-            batch_size)
+            batch_size, use_hnd=split_attention)
 
         # Run model directly without compilation and fusion
         result_unfused = model_unfused(q, k, v)
@@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         model_fused = model_fused.to(device)
 
         forward_ctx = get_forward_context()
-        forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
+        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
+            batch_size, use_hnd=split_attention)
 
         # Create test backend with fusion passes enabled
         noop_pass = NoOpEliminationPass(vllm_config)
@@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         assert model_compiled.attn._o_scale_float is None
         result_fused_1 = model_compiled(q, k, v)
 
-        # After the 1st round of the forward pass, output quant scale should be
-        # loaded into the attn layer's _o_scale_float, the 2nd round should
-        # reuse the loaded _o_scale_float
-        assert model_compiled.attn._o_scale_float is not None
-        result_fused_2 = model_compiled(q, k, v)
-        assert model_compiled.attn._o_scale_float is not None
+        if backend == _Backend.FLASHINFER:
+            # With the Flashinfer backend after the 1st round of the forward
+            # pass, output quant scale should be loaded into the attn layer's
+            # _o_scale_float, the 2nd round should reuse the loaded
+            # _o_scale_float
+            assert model_compiled.attn._o_scale_float is not None
+            result_fused_2 = model_compiled(q, k, v)
+            assert model_compiled.attn._o_scale_float is not None
+
+            torch.testing.assert_close(result_unfused,
+                                       result_fused_2,
+                                       atol=1e-2,
+                                       rtol=1e-2)
 
     # Check attn fusion support
     quant_key = model_class.quant_key
@@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
         assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
             "Attention should have output_block_scale after FP4 fusion"  # noqa: E501
 
-    # Check that results are closed
+    # Check that results are close
     torch.testing.assert_close(result_unfused,
                                result_fused_1,
                                atol=1e-2,
                                rtol=1e-2)
-    torch.testing.assert_close(result_unfused,
-                               result_fused_2,
-                               atol=1e-2,
-                               rtol=1e-2)
diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py
index fcc2589e421167567b48ec240dfedeffec5b6408..736db80a2f379f278a3583f228861029ee9bf763 100644
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import cast
+
 import pytest
 import torch
 
 import vllm.envs as envs
+from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -17,9 +20,10 @@ from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp)
+    Fp8LinearOp, cutlass_fp8_supported)
 from vllm.platforms import current_platform
 
+from ..utils import override_cutlass_fp8_supported
 from .backend import TestBackend
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -32,7 +36,7 @@ def is_nvfp4_supported():
 
 class TestSiluMulFp8QuantModel(torch.nn.Module):
 
-    def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
+    def __init__(self, hidden_size: int, cuda_force_torch: bool, **kwargs):
         super().__init__()
         self.silu_and_mul = SiluAndMul()
         self.wscale = torch.rand(1, dtype=torch.float32)
@@ -40,11 +44,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
 
         self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
 
-        self.fp8_linear = Fp8LinearOp(
-            force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
-            act_quant_static=True,
-            act_quant_group_shape=GroupShape.PER_TENSOR,
-        )
+        with override_cutlass_fp8_supported(not cuda_force_torch):
+            self.fp8_linear = Fp8LinearOp(
+                act_quant_static=True,
+                act_quant_group_shape=GroupShape.PER_TENSOR,
+            )
 
     def forward(self, x):
         y = self.silu_and_mul(x)
@@ -63,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
 
 class TestSiluMulNvfp4QuantModel(torch.nn.Module):
 
-    def __init__(self, hidden_size: int, **kwargs):
+    def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
         super().__init__()
         self.silu_and_mul = SiluAndMul()
-        self.w = torch.randint(256, (hidden_size, hidden_size // 2),
-                               dtype=FP4_DTYPE)
-        self.wscale = torch.randn(hidden_size,
-                                  hidden_size // 16).to(dtype=FP8_DTYPE)
-        self.wscale2 = torch.rand(1, dtype=torch.float32)
-        self.scale = torch.rand(1, dtype=torch.float32)
+
+        # create nvfp4 weight
+        w = torch.rand((hidden_size, hidden_size))
+        self.w, self.w_block_scale, self.w_global_scale = quant_nvfp4_tensor(w)
+
+        # get global scale offline
+        _, _, self.y_global_scale = quant_nvfp4_tensor(self.silu_and_mul(x))
+
+        self.alpha = 1.0 / (self.w_global_scale * self.y_global_scale)
 
     def forward(self, x):
         y = self.silu_and_mul(x)
-        y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
+        y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale)
         out = cutlass_scaled_fp4_mm(a=y_quant,
                                     b=self.w,
                                     block_scale_a=y_block_scale,
-                                    block_scale_b=self.wscale,
-                                    alpha=self.scale * self.wscale2,
+                                    block_scale_b=self.w_block_scale,
+                                    alpha=self.alpha,
                                     out_dtype=y.dtype)
         return out
 
@@ -94,19 +101,25 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
 @pytest.mark.parametrize("num_tokens", [64])
 @pytest.mark.parametrize("hidden_size", [128])
 @pytest.mark.parametrize(
-    "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
-    if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
-@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
+    "model_class",
+    cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
+         if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]))
+# cuda_force_torch used to test torch code path on platforms that
+# cutlass_fp8_supported() == True.
+@pytest.mark.parametrize("cuda_force_torch",
+                         [True, False] if cutlass_fp8_supported() else [True])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
                     reason="Only test on CUDA and ROCm")
 def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
-                                   force_fp8_e4m3fnuz):
-    if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
+                                   cuda_force_torch):
+    if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
         pytest.skip("Duplicate tests for NVFP4")
 
     torch.set_default_device("cuda")
     torch.set_default_dtype(torch.float16)
 
+    x = torch.rand(num_tokens, hidden_size * 2)
+
     # Reshape pass is needed for the fusion pass to work
     config = VllmConfig()
     config.compilation_config = CompilationConfig(
@@ -115,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
 
     backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
     model = model_class(hidden_size=hidden_size,
-                        force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
+                        cuda_force_torch=cuda_force_torch,
+                        x=x)
 
     # First dimension dynamic
-    x = torch.rand(num_tokens, hidden_size * 2)
     torch._dynamo.mark_dynamic(x, 0)
 
     result = model(x)
@@ -127,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
     result2 = model2(x)
 
     # Check that it gives the same answer
+    if model_class == TestSiluMulFp8QuantModel:
+        atol, rtol = 1e-3, 1e-3
+    elif model_class == TestSiluMulNvfp4QuantModel:
+        atol, rtol = 1e-1, 1e-1
+
     torch.testing.assert_close(result[0].to(dtype=torch.float16),
                                result2[0].to(dtype=torch.float16),
-                               atol=1e-3,
-                               rtol=1e-3)
+                               atol=atol,
+                               rtol=rtol)
 
     # In pre-nodes, quant op should be present and fused kernels should not
     backend.check_before_ops(model.ops_in_model_before())
diff --git a/tests/conftest.py b/tests/conftest.py
index 27db5422ceac2b9d9eb96f0c6b3901789a3b71e2..0440e859fe02dad614e5fa5c60baa35727a40ef5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,9 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ruff: noqa
+
+from tblib import pickling_support
+
+# Install support for pickling exceptions so that we can nicely propagate
+# failures from tests running in a subprocess.
+# This should be run before any custom exception subclasses are defined.
+pickling_support.install()
+
+import http.server
 import json
 import math
+import mimetypes
 import os
+import socket
 import tempfile
+import threading
+from collections.abc import Generator
 from enum import Enum
 from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
 
@@ -32,6 +47,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                          to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
+from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.sequence import Logprob
@@ -1253,3 +1269,119 @@ def cli_config_file():
 def cli_config_file_with_model():
     """Return the path to the CLI config file with model."""
     return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
+
+
+class AssetHandler(http.server.BaseHTTPRequestHandler):
+    # _IMAGE_CACHE : Dict[str, bytes] = {}
+
+    def log_message(self, *args, **kwargs):
+        pass
+
+    def do_GET(self):
+        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
+        filename = self.path.lstrip("/")
+        if not filename or "." not in filename:
+            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
+            return
+
+        base, ext = filename.rsplit(".", 1)
+        ext = ext.lower()
+
+        if ext not in ["jpg", "png"]:
+            self.send_error(404, f"Unsupported extension: .{ext}")
+            return
+
+        try:
+            data = ImageAsset(base).read_bytes(ext=ext)
+        except Exception as e:
+            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
+            return
+
+        ctype, _ = mimetypes.guess_type(filename)
+        if ctype is None:
+            ctype = {"jpg": "image/jpg", "png": "image/png"}[ext]
+        self.send_response(200)
+        self.send_header("Content-Type", ctype)
+        self.send_header("Content-Length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+
+
+def _find_free_port() -> int:
+    with socket.socket() as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+class LocalAssetServer:
+
+    address: str
+    port: int
+    server: Optional[http.server.ThreadingHTTPServer]
+    thread: Optional[threading.Thread]
+
+    def __init__(self, address: str = "127.0.0.1") -> None:
+        self.address = address
+        self.port = -1
+        self.server = None
+        self.thread = None
+
+    def __enter__(self):
+        self.port = _find_free_port()
+        self.server = http.server.ThreadingHTTPServer(
+            (self.address, self.port), AssetHandler)
+        self.thread = threading.Thread(target=self.server.serve_forever,
+                                       daemon=True)
+        self.thread.start()
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if self.server:
+            self.server.shutdown()
+            del self.server
+
+        if self.thread:
+            self.thread.join()
+            del self.thread
+
+        if exc_type is None:
+            return None
+
+        return False
+
+    @property
+    def base_url(self) -> str:
+        assert self.port is not None
+        return f"http://{self.address}:{self.port}"
+
+    def url_for(self, name: str) -> str:
+        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
+        return f"{self.base_url}/{name}"
+
+    def get_image_asset(self, name: str) -> Image.Image:
+        return fetch_image(self.url_for(name))
+
+
+@pytest.fixture(scope="session")
+def local_asset_server() -> Generator[LocalAssetServer, None, None]:
+    """
+    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port. 
+    The server currently servers images at:
+    http://127.0.0.1:<port>/<name>.<ext>
+    """
+    with LocalAssetServer() as srv:
+        yield srv
+
+
+@pytest.fixture
+def image_url(request, local_asset_server) -> str:
+    # request.param is one of the IMAGE_ASSETS filenames
+    name = request.param
+    return local_asset_server.url_for(name)
+
+
+@pytest.fixture
+def image_urls(request, local_asset_server) -> list[str]:
+    """Indirect fixture: takes a list of names, returns list of full URLs."""
+    names: list[str] = request.param
+    return [local_asset_server.url_for(name) for name in names]
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 93222b564ebe71b8de406c72157efa5bad3fef6c..8de48ef59a013125255070dec99396393ca65311 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 @pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
                                                   test_llm_generator):
-    """Verify block manager v2 with auto prefix caching could works normal
+    """Verify block manager v2 with auto prefix caching could work normally
     even when eviction started.
     With APC enabled, all blocks are held by native block at the beginning.
-    Then blocks are managed by evictor instead. If cache hit at the evitor's
+    Then blocks are managed by evictor instead. If cache hit at the evictor's
     block, then it could be reused, or we need to recompute its kv cache.
     """
     output_len = 10
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 591e1780c11c6c720825cbfadf3e9f1325d9b8ad..86e08328c43b0eb7d22bebc2da0a1cad7013a09d 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -10,7 +10,8 @@ import pytest  # noqa
 import torch
 from torch import Use  # noqa
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
@@ -641,7 +642,7 @@ def test_schedule_decode_blocks_to_copy_update():
     # Nothing is preempted.
     assert output.blocks_to_swap_out == []
     # Since append_slot returns the source -> dist mapping, it should
-    # applied.
+    # be applied.
     assert output.blocks_to_copy == [(2, 3)]
 
 
diff --git a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b32a2927f2de97ff55db68b6b23ec14559cc58e
--- /dev/null
+++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from vllm.sampling_params import SamplingParams
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer
+
+
+@pytest.fixture(params=[True, False])
+def include_stop_str_in_output(request):
+    return request.param
+
+
+class _DummyDetokenizer(BaseIncrementalDetokenizer):
+
+    def __init__(self, request: EngineCoreRequest):
+        super().__init__(request)
+
+    def decode_next(self, next_token_id: int) -> str:
+        # Map token id to single ASCII character for deterministic testing.
+        return chr(next_token_id)
+
+
+def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
+    params = SamplingParams(
+        stop=stop,
+        include_stop_str_in_output=include_stop_str_in_output,
+        min_tokens=min_tokens)
+    # Keep other fields minimal for unit test purposes.
+    req = EngineCoreRequest(
+        request_id="test",
+        prompt_token_ids=[],
+        mm_features=None,
+        sampling_params=params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
+    return req
+
+
+def test_stop_string_while_stop_token_terminates(
+        include_stop_str_in_output: bool):
+    """
+    This test verifies that the detokenizer correctly handles the case where
+    the generated token sequence contains both:
+    - a stop token
+    - an <eos> token
+    
+    The detokenizer should respect the stop string and truncate the output
+    accordingly.
+    
+    Imagine the following sequence:
+    - "abcdeZ" is generated, where "Z" is the <eos> token.
+    - "cd" is the stop string.
+    
+    If include_stop_str_in_output=False, the detokenizer should truncate the
+    output to "ab" because the stop string "cd" is excluded.
+    If include_stop_str_in_output=True, the detokenizer should include the stop
+    string "cd" in the output, resulting in "abcd".
+    
+
+    This verifies the behavioral change introduced in BaseIncrementalDetokenizer
+    where stop-string evaluation occurs before the early-return on
+    stop_terminated.
+    """
+
+    # Generate text "abcdeZ" and tokenize it.
+    generated_text = "abcde"
+    eos_token = "Z"
+    stop_string = "cd"
+    generated_text = generated_text + eos_token
+    token_ids = [ord(c) for c in generated_text]
+
+    # Create a request with the stop string and initialize the detokenizer.
+    req = _make_request(stop=[stop_string],
+                        include_stop_str_in_output=include_stop_str_in_output)
+    detok = _DummyDetokenizer(req)
+
+    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
+    result = detok.update(new_token_ids=token_ids, stop_terminated=True)
+
+    # The update should not report a stop string
+    assert result == stop_string
+
+    # Output text should reflect stop-string handling:
+    # - include_stop_str_in_output=False => exclude "cd" => "ab"
+    # - include_stop_str_in_output=True  => include "cd" => "abcd"
+    expected_text = "abcd" if include_stop_str_in_output else "ab"
+    assert detok.output_text == expected_text
+
+    # The skipped final token should still be recorded in token_ids.
+    assert detok.output_token_ids == token_ids
+
+    # get_next_output_text should return the full text when finished=True.
+    # (Buffering only applies during streaming when finished=False.)
+    assert detok.get_next_output_text(finished=True,
+                                      delta=False) == expected_text
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index 666a715cc0da114589128000c9c49718df87668f..7dc4a0cc3d58249d80cfc14c433bafc5979081f3 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -8,7 +8,7 @@ import msgspec.msgpack
 import pytest
 import zmq
 
-from vllm.config import KVEventsConfig
+from vllm.config.kv_events import KVEventsConfig
 from vllm.distributed.kv_events import EventPublisherFactory
 
 from .test_events import SampleBatch
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..23be703a30682516017107f10692dcbd7759a6e9
--- /dev/null
+++ b/tests/distributed/test_context_parallel.py
@@ -0,0 +1,263 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+WARNING: This test runs in both single-node (4 GPUs) and multi-node
+ (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
+ important to set the distributed backend to "mp" to avoid Ray scheduling
+ all workers in a node other than the head node, which can cause the test
+ to fail.
+"""
+import json
+import os
+from dataclasses import dataclass
+from typing import Literal, NamedTuple, Optional
+
+import pytest
+
+from vllm.config import RunnerOption
+from vllm.logger import init_logger
+
+from ..models.registry import HF_EXAMPLE_MODELS
+from ..utils import compare_two_settings, create_new_process_for_each_test
+
+logger = init_logger("test_context_parallel")
+
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
+
+
+class ParallelSetup(NamedTuple):
+    tp_size: int
+    pp_size: int
+    dcp_size: int
+    eager_mode: bool
+    chunked_prefill: bool
+
+
+class CPTestOptions(NamedTuple):
+    multi_node_only: bool
+    load_format: Optional[str] = None
+
+
+@dataclass
+class CPTestSettings:
+    parallel_setups: list[ParallelSetup]
+    # NOTE: the length of distributed_backends and
+    # vllm_major_versions should be the same, and they
+    # are first zipped together to iterate over all
+    # test settings.
+    distributed_backends: list[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: list[str]
+    runner: RunnerOption
+    test_options: CPTestOptions
+
+    def __post_init__(self):
+        if len(self.distributed_backends) != len(self.vllm_major_versions):
+            raise ValueError(
+                f"Length mismatch: distributed_backends "
+                f"({len(self.distributed_backends)}) != "
+                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+
+    @staticmethod
+    def detailed(
+        *,
+        tp_base: int = 4,
+        pp_base: int = 1,
+        dcp_base: int = 1,
+        multi_node_only: bool = False,
+        runner: RunnerOption = "auto",
+        load_format: Optional[str] = None,
+    ):
+        parallel_setups = []
+        for eager_mode_val in [False]:
+            for pp_multiplier in [1]:
+                for dcp_multiplier in [2, 4]:
+                    for chunked_prefill_val in [True]:
+                        parallel_setups.append(
+                            ParallelSetup(tp_size=tp_base,
+                                          pp_size=pp_multiplier * pp_base,
+                                          dcp_size=dcp_multiplier * dcp_base,
+                                          eager_mode=eager_mode_val,
+                                          chunked_prefill=chunked_prefill_val))
+        return CPTestSettings(
+            parallel_setups=parallel_setups,
+            distributed_backends=["mp"],
+            vllm_major_versions=["1"],
+            runner=runner,
+            test_options=CPTestOptions(multi_node_only=multi_node_only,
+                                       load_format=load_format),
+        )
+
+    def iter_params(self, model_id: str):
+        opts = self.test_options
+
+        for parallel_setup in self.parallel_setups:
+            for backend, vllm_major_version in zip(self.distributed_backends,
+                                                   self.vllm_major_versions):
+                yield (model_id, parallel_setup, backend, vllm_major_version,
+                       self.runner, opts)
+
+
+def _compare_cp_with_tp(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    runner: RunnerOption,
+    test_options: CPTestOptions,
+    num_gpus_available: int,
+    *,
+    method: Literal["generate"],
+    is_multimodal: bool,
+):
+    (
+        tp_size,
+        pp_size,
+        dcp_size,
+        eager_mode,
+        chunked_prefill,
+    ) = parallel_setup
+
+    multi_node_only, load_format = test_options
+
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+
+    trust_remote_code = model_info.trust_remote_code
+    tokenizer_mode = model_info.tokenizer_mode
+    hf_overrides = model_info.hf_overrides
+
+    if load_format == "dummy":
+        # Avoid OOM
+        text_overrides = {
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
+        }
+
+        if is_multimodal:
+            hf_overrides.update({"text_config": text_overrides})
+        else:
+            hf_overrides.update(text_overrides)
+    else:
+        model_info.check_available_online(on_fail="skip")
+
+    if num_gpus_available < tp_size * pp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+    if VLLM_MULTI_NODE and distributed_backend == "mp":
+        pytest.skip("Skipping multi-node pipeline parallel test for "
+                    "multiprocessing distributed backend")
+    if multi_node_only and not VLLM_MULTI_NODE:
+        pytest.skip("Not in multi-node setting")
+
+    common_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if chunked_prefill:
+        common_args.append("--enable-chunked-prefill")
+    if eager_mode:
+        common_args.append("--enforce-eager")
+    if runner != "auto":
+        common_args.extend(["--runner", runner])
+    if trust_remote_code:
+        common_args.append("--trust-remote-code")
+    if tokenizer_mode:
+        common_args.extend(["--tokenizer-mode", tokenizer_mode])
+    if load_format:
+        common_args.extend(["--load-format", load_format])
+    if hf_overrides:
+        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+
+    cp_env = tp_env = {
+        "VLLM_USE_V1":
+        vllm_major_version,  # Note(hc): DCP only support V1 engine only
+    }
+
+    cp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--pipeline-parallel-size",
+        str(pp_size),
+        "--decode-context-parallel-size",
+        str(dcp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+    ]
+
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--pipeline-parallel-size",
+        str(pp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+    ]
+
+    try:
+        compare_two_settings(model_id,
+                             cp_args,
+                             tp_args,
+                             cp_env,
+                             tp_env,
+                             method=method,
+                             max_wait_seconds=720)
+    except Exception:
+        testing_ray_compiled_graph = cp_env is not None
+        if testing_ray_compiled_graph and vllm_major_version == "0":
+            # Ray Compiled Graph tests are flaky for V0,
+            # so we don't want to fail the test
+            logger.exception("Ray Compiled Graph tests failed")
+        else:
+            raise
+
+
+CP_TEXT_GENERATION_MODELS = {
+    # [MLA attention only]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
+}
+
+CP_TEST_MODELS = [
+    # TODO support other models
+    # [LANGUAGE GENERATION]
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
+]
+
+
+@pytest.mark.parametrize(
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "runner", "test_options"),
+    [
+        params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in CP_TEST_MODELS
+    ],
+)
+@create_new_process_for_each_test()
+def test_cp_generation(
+    model_id: str,
+    parallel_setup: ParallelSetup,
+    distributed_backend: str,
+    vllm_major_version: str,
+    runner: RunnerOption,
+    test_options: CPTestOptions,
+    num_gpus_available,
+):
+    _compare_cp_with_tp(model_id,
+                        parallel_setup,
+                        distributed_backend,
+                        vllm_major_version,
+                        runner,
+                        test_options,
+                        num_gpus_available,
+                        method="generate",
+                        is_multimodal=False)
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 1afe9ea970c9784037df9a1c9fcfd07c6e06e687..fffab1a984c263a91aece6ba45612922c3f8d0c4 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -298,6 +298,8 @@ def _compare_tp(
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
     hf_config = get_config(model_id, trust_remote_code)
+    skip_tokenizer_init = model_info.skip_tokenizer_init
+    max_num_seqs = model_info.max_num_seqs
 
     dtype = "float16"
     if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
@@ -351,6 +353,10 @@ def _compare_tp(
         common_args.extend(["--load-format", load_format])
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    if skip_tokenizer_init:
+        common_args.append("--skip-tokenizer-init")
+    if max_num_seqs:
+        common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
     testing_ray_compiled_graph = False
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index c93b436f384b9aebb82971ce9a8a2c61f7d028e5..65c5e68968440e274dae512b836f224175d2c693 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -178,6 +178,7 @@ def _compare_sp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
+    skip_tokenizer_init = model_info.skip_tokenizer_init
 
     if load_format == "dummy":
         # Avoid OOM
@@ -227,6 +228,8 @@ def _compare_sp(
         common_args.extend(["--load-format", load_format])
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
+    if skip_tokenizer_init:
+        common_args.append("--skip-tokenizer-init")
 
     compilation_config = {
         'level': 3,
diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py
index 8b99d9d6e21fbe1eec439c8830c9753753cbf00f..3cf4c377fb5812a7380a3c7086aff95b6b733961 100644
--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -63,6 +63,7 @@ def clear_cache():
     current_platform.is_cpu(),
     reason="CPU backend is not currently supported with encoder/decoder models"
 )
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_encoder_decoder_e2e(
     hf_runner,
     vllm_runner,
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 93ac18dfcc7b477bea36d6aae163b10adfafa9bd..b82e8396380413ede7e2a6b2faa65a1e4e0e72d7 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -167,7 +167,7 @@ def test_get_kwargs():
     # dict should have json tip in help
     json_tip = "Should either be a valid JSON string or JSON keys"
     assert json_tip in kwargs["json_tip"]["help"]
-    # nested config should should construct the nested config
+    # nested config should construct the nested config
     assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
 
 
@@ -287,15 +287,6 @@ def test_prefix_cache_default():
         },
         "mm-processor-kwargs"
     ),
-    (
-        '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
-        {
-            "cast_logits_dtype": "bfloat16",
-            "sequence_parallel_norm": True,
-            "sequence_parallel_norm_threshold": 2048,
-        },
-        "override-neuron-config"
-    ),
 ])
 # yapf: enable
 def test_composite_arg_parser(arg, expected, option):
diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py
index 15c7a97b50e1f52b74d1612b0694b459e2d535d9..67064aff3ae9240800000a76805a1b9a31dd3669 100644
--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
                        timeout: Optional[float] = None,
                        args: tuple = (),
                        kwargs: Optional[dict] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
         return super().collective_rpc(method, timeout, args, kwargs)
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index a7c533ec24198c03ad1ca9a865d18c16c99f2174..48fd848e8820007c1c6843ba0643d5d551646669 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
 condition: column "=" number
 number: "1" | "2"
 """)
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
+    """Create zephyr LoRA files with added tokens once per test session."""
+    import shutil
+    from tempfile import TemporaryDirectory
+
+    from transformers import AutoTokenizer
+
+    tmp_dir = TemporaryDirectory()
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+    # Copy tokenizer to adapter and add some unique tokens
+    # 32000, 32001, 32002
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
+    tokenizer.save_pretrained(tmp_model_dir)
+    yield tmp_model_dir
+    tmp_dir.cleanup()
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 2cbfed98a577a8af110e73b111c33f5b1dcc89fe..bf460d0fb25d3e02af8957cda75572d27fe99552 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -7,7 +7,7 @@ import pytest
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 
-from ..openai.test_vision import TEST_IMAGE_URLS
+from ..openai.test_vision import TEST_IMAGE_ASSETS
 
 
 @pytest.fixture(scope="function")
@@ -95,7 +95,8 @@ def vision_llm():
 
 
 @pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
+                         indirect=True)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
     messages = [{
         "role":
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index a154bb1059aae15b7db54c88d554440ed2821d44..f8ed5dda260ff087ab3f72012f995033d30b0769 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
             )
 
             # Need to re-import huggingface_hub
-            # and friends to setup offline mode
+            # and friends to set up offline mode
             _re_import_modules()
             # Cached model files should be used in offline mode
             for model_config in MODEL_CONFIGS:
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
                 disable_connect,
             )
             # Need to re-import huggingface_hub
-            # and friends to setup offline mode
+            # and friends to set up offline mode
             _re_import_modules()
             engine_args = EngineArgs(model="facebook/opt-125m")
             LLM(**dataclasses.asdict(engine_args))
diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ecdd4245df43d4fc8f9b1091d5428c7abdfef64
--- /dev/null
+++ b/tests/entrypoints/openai/conftest.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.assets.audio import AudioAsset
+
+
+@pytest.fixture
+def mary_had_lamb():
+    path = AudioAsset('mary_had_lamb').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+
+
+@pytest.fixture
+def winning_call():
+    path = AudioAsset('winning_call').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+
+
+@pytest.fixture
+def foscolo():
+    # Test translation it->en
+    path = AudioAsset('azacinto_foscolo').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index 0d0ce0be8c5f86e568f4d6640d06586867efd7f5..9122b7003bf9a3cb9e0baaaccab8abb7a0938e59 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
 
 async def transcribe_audio(client, tokenizer, y, sr):
     # Send loaded audio directly instead of loading from disk,
-    # dont account for that time though
+    # don't account for that time though
     with to_bytes(y, sr) as f:
         start_time = time.perf_counter()
         transcription = await client.audio.transcriptions.create(
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 5ad29d70f10df95ca5acd81754a5dfc8d0fd1e32..4608850c7dae22e29e0653de26367e1f3a781af1 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -12,11 +12,9 @@ import pytest_asyncio
 import regex as re
 import requests
 import torch
-from openai import BadRequestError, OpenAI
+from openai import BadRequestError
 
 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
                 or "less_than_equal" in exc_info.value.message)
 
 
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
-    url = f"http://localhost:{server.port}/v1/chat/completions"
-    headers = {
-        "Content-Type": "application/json",
-    }
-    data = {
-        # model_name is avoided here.
-        "messages": [{
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }, {
-            "role": "user",
-            "content": "what is 1+1?"
-        }],
-        "max_tokens":
-        5
-    }
-
-    response = requests.post(url, headers=headers, json=data)
-    response_data = response.json()
-    print(response_data)
-    assert response_data.get("model") == MODEL_NAME
-    choice = response_data.get("choices")[0]
-    message = choice.get("message")
-    assert message is not None
-    content = message.get("content")
-    assert content is not None
-    assert len(content) > 0
-
-
-@pytest.mark.asyncio
-async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer):
-    openai_api_key = "EMPTY"
-    openai_api_base = f"http://localhost:{server.port}/v1"
-
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    messages = [
-        {
-            "role": "user",
-            "content": "Hello, vLLM!"
-        },
-    ]
-    response = client.chat.completions.create(
-        model="",  # empty string
-        messages=messages,
-    )
-    assert response.model == MODEL_NAME
-
-
 @pytest.mark.asyncio
 async def test_invocations(server: RemoteOpenAIServer,
                            client: openai.AsyncOpenAI):
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index 5b6e2a4146b1fa3f028c521173276faefe814ff6..ce90a67c015177743cfb934c1e6ee098c2ac425a 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
         trust_remote_code=model_info.trust_remote_code,
         revision=model_info.revision,
         hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
 
     # Initialize the tokenizer
     tokenizer = get_tokenizer(
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 74ef6deeea16b8ef0eda323ad4a691f3efdc0f20..d55f8d9d65d9b8b78aee756e8cf9fbae69b3e0fa 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -3,8 +3,6 @@
 # imports for guided decoding tests
 import json
 import os
-import shutil
-from tempfile import TemporaryDirectory
 from typing import Optional
 
 import jsonschema
@@ -14,9 +12,7 @@ import pytest_asyncio
 import regex as re
 import requests
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoTokenizer
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
 
 
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
-
-
 @pytest.fixture(scope="module")
 def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
     return [
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index 00d3ffb61ee9f995367aa15fa58c51192a9b6632..a0ef31762ea150a915234f766b6fb1458b494d45 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -3,48 +3,23 @@
 
 import base64
 import io
-import shutil
-from tempfile import TemporaryDirectory
 
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
 import torch
 # downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig
 
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
 
 
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
-
-
 @pytest.fixture(scope="module")
 def default_server_args(
     zephyr_lora_files,
diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py
index 9c2aef23e87722aa97dc57315d8bf5e14305aeae..75612962c95f7c8a43004c8a5097f5e0d6b18df3 100644
--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -30,6 +30,7 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="bart is not yet supported in V1")
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
                                                  prompt="Hello, my name is",
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index bcdeaaacedea0e03ec77187c0eb921c249e4065a..f91dcf194b839fa2ca75a1ad3faa6b3c97f75472 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -9,8 +9,6 @@ from contextlib import suppress
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 
 from ...utils import RemoteOpenAIServer
 
@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
 
 BADREQUEST_CASES = [
     (
@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
 ]
 
 
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
 @pytest.fixture(scope="module")
 def monkeypatch_module():
     from _pytest.monkeypatch import MonkeyPatch
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index ff2e7004ff9f88ac5436bc2ecffbb2414ebfd0da..a4e1aca8bcac24e5d372c3f3cc55a2f118fc46f9 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_params_max_tokens_sum",
     "vllm:request_params_max_tokens_bucket",
     "vllm:request_params_max_tokens_count",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:inter_token_latency_seconds_sum",
+    "vllm:inter_token_latency_seconds_bucket",
+    "vllm:inter_token_latency_seconds_count",
     "vllm:e2e_request_latency_seconds_sum",
     "vllm:e2e_request_latency_seconds_bucket",
     "vllm:e2e_request_latency_seconds_count",
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]
 
-HIDDEN_DEPRECATED_METRICS: list[str] = []
+HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]
 
 
 @pytest.mark.asyncio
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
     assert response.status_code == HTTPStatus.OK
 
     for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (not server.show_hidden_metrics
-                and metric not in HIDDEN_DEPRECATED_METRICS):
-            assert metric in response.text
+        if (metric in HIDDEN_DEPRECATED_METRICS
+                and not server.show_hidden_metrics):
+            continue
+        assert metric in response.text
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 1980daa80db9e5b97b578751bc972ef8160671c5..7cd3ca196a431d7f671b3af20edb16ac2f43c3ad 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -4,8 +4,6 @@
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 
 from ...utils import RemoteOpenAIServer
 
@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index 4197583074dfec6f3510e558423d87917c528e9c..bfa3f983cd87ee41cf33376805e8c5cf5172a70d 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -10,7 +10,7 @@ import pytest
 import regex as re
 import torch
 
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.renderer import BaseRenderer
 
 from ...utils import RemoteOpenAIServer
 
@@ -27,12 +27,16 @@ async def test_empty_prompt():
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
 
-        with pytest.raises(openai.BadRequestError,
-                           match="decoder prompt cannot be empty"):
+        with pytest.raises(
+                openai.BadRequestError,
+                match=
+                "Either prompt or prompt_embeds must be provided and non-empty."
+        ):
             await client.completions.create(model=model_name,
                                             prompt="",
                                             max_tokens=5,
-                                            temperature=0.0)
+                                            temperature=0.0,
+                                            extra_body={"prompt_embeds": []})
 
 
 @pytest.mark.asyncio
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
     buffer.seek(0)
     encoded_tensor = pybase64.b64encode(buffer.getvalue())
 
-    loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor)
+    loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
     assert len(loaded_prompt_embeds) == 1
     loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
     assert loaded_tensor.device.type == "cpu"
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 72d468db08f655b0b971e1c79aff41bfe99c6b2d..0d5836fab5a7c409319c7c1e75499b21f369fcc9 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_streaming(client: OpenAI, model_name: str):
+@pytest.mark.parametrize("background", [True, False])
+async def test_streaming(client: OpenAI, model_name: str, background: bool):
     # TODO: Add back when web search and code interpreter are available in CI
     prompts = [
         "tell me a story about a cat in 20 words",
@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
                 # },
             ],
             stream=True,
+            background=background,
         )
 
         events = []
         current_event_mode = None
+        resp_id = None
         async for event in response:
+            if event.type == "response.created":
+                resp_id = event.response.id
+
             if current_event_mode != event.type:
                 current_event_mode = event.type
                 print(f"\n[{event.type}] ", end="", flush=True)
@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
 
         assert len(events) > 0
 
+        if background:
+            starting_after = 5
+            async with await client.responses.retrieve(
+                    response_id=resp_id,
+                    stream=True,
+                    starting_after=starting_after) as stream:
+                counter = starting_after
+                async for event in stream:
+                    counter += 1
+                    assert event == events[counter]
+
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py
index 6addcb41c4098e17d72a208823f94228b697885d..ff8f193fec5520cbf6170c9c5021dd46dfc99f48 100644
--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
                 logprobs_token_ids.append(token_id)
 
         # When echo=True, the logprobs include both prompt and response tokens
-        # The token_ids field should match the the suffix of response portion
+        # The token_ids field should match the suffix of response portion
         # The prompt_token_ids should match the prompt portion
         assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
         response_token_ids_length = len(completion.choices[0].token_ids)
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index af58fbd4b36406fbd12ed12d9dfa89de44319890..5f43fdc9588f3e3a994cdf62da807f1567e63efc 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
 from .test_completion import default_server_args  # noqa: F401
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 from .test_completion import MODEL_NAME
 
 
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 10879f0be83c8a8d90b73fb0adfdbfc3aba62733..d219a1f311f1523c3795d10a46dc943cbd22ca11 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,13 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from __future__ import annotations
+
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from unittest.mock import MagicMock
 
 import pytest
+import pytest_asyncio
 
 from vllm.config import MultiModalConfig
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                     OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
+from ...utils import RemoteOpenAIServer
+
+if TYPE_CHECKING:
+    from openai import OpenAI
+
+GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
+
+
+@pytest.fixture(scope="module")
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module",
+                params=[True, False],
+                ids=["with_tool_parser", "without_tool_parser"])
+def with_tool_parser(request) -> bool:
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def default_server_args(with_tool_parser: bool):
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--enforce-eager",
+        "--max-model-len",
+        "4096",
+        "--reasoning-parser",
+        "openai_gptoss",
+        "--gpu-memory-utilization",
+        "0.8",
+    ]
+    if with_tool_parser:
+        args.extend([
+            "--tool-call-parser",
+            "openai",
+            "--enable-auto-tool-choice",
+        ])
+    return args
+
+
+@pytest.fixture(scope="module")
+def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
+                  default_server_args: list[str]):
+    with monkeypatch_module.context() as m:
+        m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
+                                default_server_args) as remote_server:
+            yield remote_server
+
+
+@pytest_asyncio.fixture
+async def gptoss_client(gptoss_server):
+    async with gptoss_server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
+                                                with_tool_parser: bool):
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    },
+                    "state": {
+                        "type": "string"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    }]
+
+    messages = [
+        {
+            "role": "user",
+            "content": "What is the weather in Dallas, TX?"
+        },
+    ]
+
+    stream = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools if with_tool_parser else None,
+        stream=True)
+
+    name = None
+    args_buf = ""
+    content_buf = ""
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.tool_calls:
+            tc = delta.tool_calls[0]
+            if tc.function and tc.function.name:
+                name = tc.function.name
+            if tc.function and tc.function.arguments:
+                args_buf += tc.function.arguments
+        if getattr(delta, "content", None):
+            content_buf += delta.content
+    if with_tool_parser:
+        assert name is not None
+        assert len(args_buf) > 0
+    else:
+        assert name is None
+        assert len(args_buf) == 0
+        assert len(content_buf) > 0
+
+
+@pytest.mark.asyncio
+async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
+                                       with_tool_parser: bool):
+    if not with_tool_parser:
+        pytest.skip("skip non-tool for multi-turn tests")
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string"
+                    },
+                    "state": {
+                        "type": "string"
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    }]
+
+    messages = [
+        {
+            "role": "system",
+            "content": "you are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": "What is the weather in Dallas, TX with celsius?"
+        },
+    ]
+
+    first = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        temperature=0.0,
+    )
+    first_msg = first.choices[0].message
+    assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
+    tc = first_msg.tool_calls[0]
+    assert tc.function is not None and tc.function.name == "get_current_weather"
+    args1 = tc.function.arguments
+    assert args1 is not None and len(args1) > 0
+
+    messages.append({"role": "assistant", "content": args1})
+    messages.append({
+        "role": "user",
+        "content": "Now convert to celsius and return JSON only"
+    })
+
+    second = await gptoss_client.chat.completions.create(
+        model=GPT_OSS_MODEL_NAME,
+        messages=messages,
+        tools=tools,
+        temperature=0.0,
+    )
+    second_msg = second.choices[0].message
+    assert (second_msg.content is not None and len(second_msg.content) > 0) or \
+        (second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
+
+
 MODEL_NAME = "openai-community/gpt2"
+MODEL_NAME_SHORT = "gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
-BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+BASE_MODEL_PATHS = [
+    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
+    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT)
+]
 
 
 @dataclass
@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
     assert serving_completion.chat_template == CHAT_TEMPLATE
 
 
+@pytest.mark.asyncio
+async def test_serving_chat_returns_correct_model_name():
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    models = OpenAIServingModels(engine_client=mock_engine,
+                                 base_model_paths=BASE_MODEL_PATHS,
+                                 model_config=MockModelConfig())
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     MockModelConfig(),
+                                     models,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     request_logger=None)
+    messages = [{"role": "user", "content": "what is 1+1?"}]
+
+    async def return_model_name(*args):
+        return args[3]
+
+    serving_chat.chat_completion_full_generator = return_model_name
+
+    # Test that full name is returned when short name is requested
+    req = ChatCompletionRequest(model=MODEL_NAME_SHORT, messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
+
+    # Test that full name is returned when empty string is specified
+    req = ChatCompletionRequest(model="", messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
+
+    # Test that full name is returned when no model is specified
+    req = ChatCompletionRequest(messages=messages)
+    assert await serving_chat.create_chat_completion(req) == MODEL_NAME
+
+
 @pytest.mark.asyncio
 async def test_serving_chat_should_set_correct_max_tokens():
     mock_engine = MagicMock(spec=MQLLMEngineClient)
@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
         }],
     )
 
-    # By default cache_salt in the engine prompt is not set
+    # By default, cache_salt in the engine prompt is not set
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
     assert "cache_salt" not in mock_engine.generate.call_args.args[0]
diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py
index 0bb42ed8aa7fb3108ad0cd6f813532ddc3e37d5a..840e0dac81c9767b1632065903c541fafbbacb68 100644
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -11,7 +11,7 @@ import torch
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"
 
 
@@ -35,7 +35,9 @@ def server():
         "--trust-remote-code",
         "--skip-tokenizer-init",
         "--max-num-seqs",
-        "32"
+        "32",
+        "--model-impl",
+        "terratorch"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index 0dbbdfbfd24ad43edaabcc2610869fb95757662f..72c8a3510c9b0fe2faffe15d7367428f8107a633 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -8,8 +8,6 @@ import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
-from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
-from .test_completion import zephyr_lora_files  # noqa: F401
 
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 6009d9aeec935b59ec647f93dbb2f8d9674f0fc0..6a3cdfdfc80811abd5118ab8468cd6ea6d81b212 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -12,8 +12,6 @@ import pytest
 import pytest_asyncio
 import soundfile as sf
 
-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "openai/whisper-large-v3-turbo"
@@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
 ]
 
 
-@pytest.fixture
-def mary_had_lamb():
-    path = AudioAsset('mary_had_lamb').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture
-def winning_call():
-    path = AudioAsset('winning_call').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
 @pytest.fixture(scope="module")
 def server():
     with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
         assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
+@pytest.mark.asyncio
+async def test_basic_audio_gemma(foscolo):
+    # Gemma accuracy on some of the audio samples we use is particularly bad,
+    # hence we use a different one here. WER is evaluated separately.
+    model_name = "google/gemma-3n-E2B-it"
+    server_args = ["--enforce-eager"]
+
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=foscolo,
+            language="it",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "da cui vergine nacque Venere" in out
+
+
 @pytest.mark.asyncio
 async def test_non_asr_model(winning_call):
     # text to text model
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index f4f5c66f2deeb285690f9070d9d9c95e83cd00d3..f43b7a253d28df2289d5b781684f4f80a06d41db 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -12,32 +12,24 @@ import pytest
 import pytest_asyncio
 import soundfile as sf
 
-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "openai/whisper-small"
 SERVER_ARGS = ["--enforce-eager"]
 
 
-@pytest.fixture
-def foscolo():
-    # Test translation it->en
-    path = AudioAsset('azacinto_foscolo').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture(scope="module")
-def server():
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
-        yield remote_server
+@pytest.fixture(scope="module",
+                params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
+def server(request):
+    # Parametrize over model name
+    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+        yield remote_server, request.param
 
 
 @pytest_asyncio.fixture
-async def client(server):
+async def client_and_model(server):
+    server, model_name = server
     async with server.get_async_client() as async_client:
-        yield async_client
+        yield async_client, model_name
 
 
 @pytest.mark.asyncio
@@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):
 
 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
 @pytest.mark.asyncio
-async def test_basic_audio(foscolo, client):
+async def test_basic_audio(foscolo, client_and_model):
+    client, model_name = client_and_model
     translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=foscolo,
         response_format="text",
-        # TODO remove once language detection is implemented
-        extra_body=dict(language="it"),
+        # TODO remove `language="it"` once language detection is implemented
+        extra_body=dict(language="it", to_language="en"),
         temperature=0.0)
     out = json.loads(translation)['text'].strip().lower()
     assert "greek sea" in out
 
 
 @pytest.mark.asyncio
-async def test_audio_prompt(foscolo, client):
+async def test_audio_prompt(foscolo, client_and_model):
+    client, model_name = client_and_model
     # Condition whisper on starting text
     prompt = "Nor have I ever"
     transcription = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=foscolo,
         prompt=prompt,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
         response_format="text",
         temperature=0.0)
     out = json.loads(transcription)['text']
@@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):
 
 
 @pytest.mark.asyncio
-async def test_streaming_response(foscolo, client, server):
+async def test_streaming_response(foscolo, client_and_model, server):
+    client, model_name = client_and_model
     translation = ""
     res_no_stream = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=foscolo,
         response_format="json",
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en", seed=42),
         temperature=0.0)
+
     # Stream via HTTPX since OpenAI translation client doesn't expose streaming
+    server, model_name = server
     url = server.url_for("v1/audio/translations")
     headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
     data = {
-        "model": MODEL_NAME,
+        "model": model_name,
         "language": "it",
+        "to_language": "en",
         "stream": True,
         "temperature": 0.0,
+        "seed": 42,
     }
     foscolo.seek(0)
     async with httpx.AsyncClient() as http_client:
@@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
                 text = chunk["choices"][0].get("delta", {}).get("content")
                 translation += text or ""
 
-    assert translation == res_no_stream.text
+    res_stream = translation.split()
+    # NOTE There's a small non-deterministic issue here, likely in the attn
+    # computation, which will cause a few tokens to be different, while still
+    # being very close semantically.
+    assert sum([
+        x == y for x, y in zip(res_stream, res_no_stream.text.split())
+    ]) >= len(res_stream) * 0.9
 
 
 @pytest.mark.asyncio
-async def test_stream_options(foscolo, client, server):
+async def test_stream_options(foscolo, server):
+    server, model_name = server
     url = server.url_for("v1/audio/translations")
     headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
     data = {
-        "model": MODEL_NAME,
+        "model": model_name,
         "language": "it",
+        "to_language": "en",
         "stream": True,
         "stream_include_usage": True,
         "stream_continuous_usage_stats": True,
@@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):
 
 
 @pytest.mark.asyncio
-async def test_long_audio_request(foscolo, client):
+async def test_long_audio_request(foscolo, client_and_model):
+    client, model_name = client_and_model
+    if model_name == "google/gemma-3n-E2B-it":
+        pytest.skip("Gemma3n does not support long audio requests")
     foscolo.seek(0)
     audio, sr = librosa.load(foscolo)
     repeated_audio = np.tile(audio, 2)
@@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
     sf.write(buffer, repeated_audio, sr, format='WAV')
     buffer.seek(0)
     translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
         file=buffer,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
         response_format="text",
         temperature=0.0)
     out = json.loads(translation)['text'].strip().lower()
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 106ec121a422e833971031dfe7eb005f69e46e6f..29a3b40d2d86525c98ce554d480341fb25de32ae 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
 MAXIMUM_IMAGES = 2
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
 
 EXPECTED_MM_BEAM_SEARCH_RES = [
@@ -69,10 +69,11 @@ async def client(server):
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_asset:
+        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        for image_asset in TEST_IMAGE_ASSETS
     }
 
 
@@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                          model_name: str, image_url: str):
     content_text = "What's in this image?"
@@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
                                                model_name: str,
                                                image_url: str):
@@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
                                                     model_name: str,
                                                     image_url: str):
@@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image_base64encoded(
-        client: openai.AsyncOpenAI, model_name: str, image_url: str,
-        base64_encoded_image: dict[str, str]):
+        client: openai.AsyncOpenAI, model_name: str, raw_image_url: str,
+        image_url: str, base64_encoded_image: dict[str, str]):
 
     content_text = "What's in this image?"
     messages = [{
@@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded(
                 "type": "image_url",
                 "image_url": {
                     "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
                 }
             },
             {
@@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
+@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
 async def test_single_chat_session_image_base64encoded_beamsearch(
         client: openai.AsyncOpenAI, model_name: str, image_idx: int,
         base64_encoded_image: dict[str, str]):
     # NOTE: This test also validates that we pass MM data through beam search
-    image_url = TEST_IMAGE_URLS[image_idx]
+    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
     expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
 
     messages = [{
@@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
                 "type": "image_url",
                 "image_url": {
                     "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
                 }
             },
             {
@@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_chat_streaming_image(client: openai.AsyncOpenAI,
                                     model_name: str, image_url: str):
     messages = [{
@@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
     "image_urls",
-    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                  image_urls: list[str]):
 
@@ -433,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
         )
         message = chat_completion.choices[0].message
         assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                            }
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image_with_uuid(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                            },
+                            "uuid": image_url
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image_with_incorrect_uuid_format(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                                "incorrect_uuid_key": image_url,
+                            },
+                            "also_incorrect_uuid_key": image_url,
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
diff --git a/tests/entrypoints/pooling/__init__.py b/tests/entrypoints/pooling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/entrypoints/pooling/correctness/__init__.py b/tests/entrypoints/pooling/correctness/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/entrypoints/openai/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py
similarity index 71%
rename from tests/entrypoints/openai/correctness/test_mteb_embed.py
rename to tests/entrypoints/pooling/correctness/test_mteb_embed.py
index 783f7d3e0d5aaef65dd1b41bac1276f80eb82f57..12a4875bdacfd2c8a23a89c400f4fe9e154eeabc 100644
--- a/tests/entrypoints/openai/correctness/test_mteb_embed.py
+++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py
@@ -4,10 +4,9 @@ import os
 
 import pytest
 
-from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
-                                                      MTEB_EMBED_TOL,
-                                                      OpenAIClientMtebEncoder,
-                                                      run_mteb_embed_task)
+from tests.models.language.pooling_mteb_test.mteb_utils import (
+    MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder,
+    run_mteb_embed_task)
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
@@ -37,4 +36,6 @@ def test_mteb_embed(server):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py
similarity index 64%
rename from tests/entrypoints/openai/correctness/test_mteb_score.py
rename to tests/entrypoints/pooling/correctness/test_mteb_score.py
index cfb865815c9b28cddbb609b91f80945f607108b3..7c059d16b38635022bc2d258227282a9f7a73405 100644
--- a/tests/entrypoints/openai/correctness/test_mteb_score.py
+++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py
@@ -4,18 +4,15 @@ import os
 
 import pytest
 
-# yapf conflicts with isort for this block
-# yapf: disable
-from tests.models.language.pooling.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_utils import (
     MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
-    RerankClientMtebEncoder, ScoreClientMtebEncoder,
-    mteb_test_rerank_models_hf, run_mteb_rerank)
-# yapf: enable
+    RerankClientMtebEncoder, ScoreClientMtebEncoder, run_mteb_rerank)
 from tests.utils import RemoteOpenAIServer
 
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+st_main_score = 0.33457
 
 
 @pytest.fixture(scope="module")
@@ -29,15 +26,7 @@ def server():
         yield remote_server
 
 
-@pytest.fixture(scope="module")
-def st_main_score(hf_runner):
-    # The main score related to the version of the dependency.
-    # So we need to recalculate every time.
-    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
-    return main_score
-
-
-def test_mteb_score(server, st_main_score):
+def test_mteb_score(server):
     url = server.url_for("score")
     encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -47,10 +36,12 @@ def test_mteb_score(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
 
 
-def test_mteb_rerank(server, st_main_score):
+def test_mteb_rerank(server):
     url = server.url_for("rerank")
     encoder = RerankClientMtebEncoder(MODEL_NAME, url)
     vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -60,4 +51,6 @@ def test_mteb_rerank(server, st_main_score):
     print("SentenceTransformer main score: ", st_main_score)
     print("Difference: ", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
diff --git a/tests/entrypoints/pooling/llm/__init__.py b/tests/entrypoints/pooling/llm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py
similarity index 98%
rename from tests/entrypoints/llm/test_classify.py
rename to tests/entrypoints/pooling/llm/test_classify.py
index 6c0c9cd0158010adb7710e11804afd8a01b05de8..ff5cea11a9182cd8d0bf46349b6a9fd7ae8caa15 100644
--- a/tests/entrypoints/llm/test_classify.py
+++ b/tests/entrypoints/pooling/llm/test_classify.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch
 
+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 
-from ...models.utils import softmax
-
 MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
 
 prompts = ["The chef prepared a delicious meal."]
diff --git a/tests/entrypoints/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py
similarity index 100%
rename from tests/entrypoints/llm/test_embedding.py
rename to tests/entrypoints/pooling/llm/test_embedding.py
diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py
similarity index 100%
rename from tests/entrypoints/llm/test_encode.py
rename to tests/entrypoints/pooling/llm/test_encode.py
diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py
similarity index 97%
rename from tests/entrypoints/llm/test_reward.py
rename to tests/entrypoints/pooling/llm/test_reward.py
index 2cee3c8d94e362281c503aa08a545a37789c29ba..11d164c978a9264929011c8218383e0bd14dcdd2 100644
--- a/tests/entrypoints/llm/test_reward.py
+++ b/tests/entrypoints/pooling/llm/test_reward.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch
 
+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 
-from ...models.utils import softmax
-
 MODEL_NAME = "internlm/internlm2-1_8b-reward"
 
 prompts = ["The chef prepared a delicious meal."]
diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py
similarity index 97%
rename from tests/entrypoints/llm/test_score.py
rename to tests/entrypoints/pooling/llm/test_score.py
index f715dacacb8ff22ddcff5dfc2b2d5518226cc59a..447378f989d09cae7599ad8635deb8bd0a5881d6 100644
--- a/tests/entrypoints/llm/test_score.py
+++ b/tests/entrypoints/pooling/llm/test_score.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch
 
+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 
-from ...models.utils import softmax
-
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
 
 
diff --git a/tests/entrypoints/pooling/openai/__init__.py b/tests/entrypoints/pooling/openai/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py
similarity index 99%
rename from tests/entrypoints/openai/test_classification.py
rename to tests/entrypoints/pooling/openai/test_classification.py
index 36c96d76c2e5f5f02060f1248706b0f71b038122..26c2c8e6af17d9e9721977ee02647b06737a2a5b 100644
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/pooling/openai/test_classification.py
@@ -6,10 +6,9 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import ClassificationResponse
 
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
 DTYPE = "float32"  # Use float32 to avoid NaN issue
 
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py
similarity index 98%
rename from tests/entrypoints/openai/test_embedding.py
rename to tests/entrypoints/pooling/openai/test_embedding.py
index d46ab304ba6d5495ad27d9f463d183cb3200a278..37a10e79d4fc770af68afe26dc1027703b74c3ec 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_embedding.py
@@ -11,14 +11,13 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from tests.models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
+from tests.models.utils import check_embeddings_close
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
-from ...models.utils import check_embeddings_close
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"
diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
similarity index 95%
rename from tests/entrypoints/openai/test_embedding_dimensions.py
rename to tests/entrypoints/pooling/openai/test_embedding_dimensions.py
index 91e91699b92ca39f80ba48afc3adb75a12387716..3c7e88daa8ff3a394040b7a5c78727bab0e32acc 100644
--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
@@ -9,13 +9,12 @@ from typing import Optional
 import openai
 import pytest
 
-from vllm.entrypoints.openai.protocol import EmbeddingResponse
-
-from ...conftest import HfRunner
-from ...models.language.pooling.embed_utils import (
+from tests.conftest import HfRunner
+from tests.models.language.pooling.embed_utils import (
     run_embedding_correctness_test)
-from ...models.utils import EmbedModelInfo
-from ...utils import RemoteOpenAIServer
+from tests.models.utils import EmbedModelInfo
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
 MODELS = [
     EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
diff --git a/tests/entrypoints/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py
similarity index 99%
rename from tests/entrypoints/openai/test_embedding_long_text.py
rename to tests/entrypoints/pooling/openai/test_embedding_long_text.py
index 86bd34abb97e09343fcd447271c07bb8df06ea31..2d3da238d245e9ca329e10914c07b603f6f64eaf 100644
--- a/tests/entrypoints/openai/test_embedding_long_text.py
+++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py
@@ -14,10 +14,9 @@ import openai
 import pytest
 import pytest_asyncio
 
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 
-from ...utils import RemoteOpenAIServer
-
 
 def _generate_random_text(word_count: int) -> str:
     """Generate random text with approximately the specified word count."""
diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/pooling/openai/test_pooling.py
similarity index 99%
rename from tests/entrypoints/openai/test_pooling.py
rename to tests/entrypoints/pooling/openai/test_pooling.py
index 63f4205e0a42b0650afd219d10fbe8f20577687b..9f58955cfb40bec639ef3a4f9247c1834d5aad70 100644
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/pooling/openai/test_pooling.py
@@ -8,11 +8,10 @@ import pytest
 import requests
 
 from tests.models.utils import check_embeddings_close
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import PoolingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "internlm/internlm2-1_8b-reward"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py
similarity index 99%
rename from tests/entrypoints/openai/test_rerank.py
rename to tests/entrypoints/pooling/openai/test_rerank.py
index ce4d6c5f5d337725b3629caa2a6e941e9898f572..992cb5147ef0d516557d38d4f5ac6db616b3f829 100644
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/pooling/openai/test_rerank.py
@@ -6,10 +6,9 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import RerankResponse
 
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
 
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py
similarity index 99%
rename from tests/entrypoints/openai/test_score.py
rename to tests/entrypoints/pooling/openai/test_score.py
index 4fafcfb45fa222bec9d16c002e31fb8eda81a183..d676ecccbc87c67f12b755d639e8bf7aa43120cb 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/pooling/openai/test_score.py
@@ -8,10 +8,9 @@ import torch
 import torch.nn.functional as F
 from torch import tensor
 
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import ScoreResponse
 
-from ...utils import RemoteOpenAIServer
-
 MODELS = [
     {
         "name": "BAAI/bge-reranker-v2-m3",
diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/pooling/openai/test_truncation.py
similarity index 87%
rename from tests/entrypoints/openai/test_truncation.py
rename to tests/entrypoints/pooling/openai/test_truncation.py
index 121c0413e1af78b7c69a9c50e458db814fa2471a..6bdf5ce7c4a6c91a349fb929eec3171160209b6a 100644
--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/pooling/openai/test_truncation.py
@@ -73,17 +73,11 @@ async def test_zero_truncation_size(client: openai.AsyncOpenAI):
         "truncate_prompt_tokens": truncation_size
     }
 
-    with pytest.raises(openai.BadRequestError) as err:
-        await client.post(path="embeddings", cast_to=object, body={**kwargs})
-
-    assert err.value.status_code == 400
-    error_details = err.value.response.json()["error"]
+    response = await client.post(path="embeddings",
+                                 cast_to=object,
+                                 body={**kwargs})
 
-    assert error_details["type"] == "BadRequestError"
-    assert "This model's maximum context length is" in error_details["message"]
-    assert "tokens in the input for embedding generation" in error_details[
-        "message"]
-    assert "Please reduce the length of the input" in error_details["message"]
+    assert response["usage"]["prompt_tokens"] == truncation_size
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/pooling/openai/test_vision_embedding.py
similarity index 74%
rename from tests/entrypoints/openai/test_vision_embedding.py
rename to tests/entrypoints/pooling/openai/test_vision_embedding.py
index d3cc2fac6af5754eb591a2e36b5ce790d375a8c8..48434e36eb2659e27b9a26b6d9cbf8b06fcd3ae0 100644
--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_vision_embedding.py
@@ -7,11 +7,10 @@ import pytest
 import requests
 from transformers import AutoProcessor
 
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
-
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
 
@@ -19,11 +18,11 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
 assert vlm2vec_jinja_path.exists()
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
 
 
@@ -49,10 +48,11 @@ def server():
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_url:
+        encode_image_base64(local_asset_server.get_image_asset(image_url))
+        for image_url in TEST_IMAGE_ASSETS
     }
 
 
@@ -70,7 +70,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
                                image_url: str):
     content_text = "Represent the given image."
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
index e4af60a782651e73c865ae4205fb923b1cbd3963..a993e24ff838aa9dbc17f079b6563e0e7a052d59 100644
--- a/tests/entrypoints/test_api_server_process_manager.py
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -95,7 +95,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
             assert not proc.is_alive()
 
 
-@patch("vllm.entrypoints.cli.serve.run_api_server_worker",
+@patch("vllm.entrypoints.cli.serve.run_api_server_worker_proc",
        mock_run_api_server_worker)
 def test_wait_for_completion_or_failure(api_server_args):
     """Test that wait_for_completion_or_failure works with failures."""
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 647f1c7b7f34f5a833c5d5edc90eb704895e091e..5149ca346050ed9bd0d531b0b0158ef903f6f36e 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                          resolve_chat_template_content_format,
                                          resolve_hf_chat_template)
 from vllm.entrypoints.llm import apply_hf_chat_template
-from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
                                    encode_video_base64)
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -46,23 +46,27 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 
 @pytest.fixture(scope="function")
 def phi3v_model_config():
-    return ModelConfig(PHI3V_MODEL_ID,
-                       runner="generate",
-                       trust_remote_code=True,
-                       limit_mm_per_prompt={
-                           "image": 2,
-                       })
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 
 
 @pytest.fixture(scope="function")
 def phi3v_model_config_mm_interleaved():
-    return ModelConfig(PHI3V_MODEL_ID,
-                       runner="generate",
-                       trust_remote_code=True,
-                       interleave_mm_strings=True,
-                       limit_mm_per_prompt={
-                           "image": 2,
-                       })
+    return ModelConfig(
+        PHI3V_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        interleave_mm_strings=True,
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 
 
 @pytest.fixture(scope="module")
@@ -77,14 +81,16 @@ def phi3v_tokenizer():
 
 @pytest.fixture(scope="function")
 def qwen25omni_model_config_mm_interleaved():
-    return ModelConfig(QWEN25OMNI_MODEL_ID,
-                       runner="generate",
-                       interleave_mm_strings=True,
-                       limit_mm_per_prompt={
-                           "image": 2,
-                           "audio": 1,
-                           "video": 1,
-                       })
+    return ModelConfig(
+        QWEN25OMNI_MODEL_ID,
+        runner="generate",
+        interleave_mm_strings=True,
+        limit_mm_per_prompt={
+            "image": 2,
+            "audio": 1,
+            "video": 1,
+        },
+    )
 
 
 @pytest.fixture(scope="module")
@@ -99,11 +105,13 @@ def qwen25omni_tokenizer():
 
 @pytest.fixture(scope="module")
 def mllama_model_config():
-    return ModelConfig(MLLAMA_MODEL_ID,
-                       runner="generate",
-                       limit_mm_per_prompt={
-                           "image": 2,
-                       })
+    return ModelConfig(
+        MLLAMA_MODEL_ID,
+        runner="generate",
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 
 
 @pytest.fixture(scope="module")
@@ -118,11 +126,13 @@ def mllama_tokenizer():
 
 @pytest.fixture(scope="function")
 def mistral_model_config():
-    return ModelConfig(MISTRAL_MODEL_ID,
-                       runner="generate",
-                       limit_mm_per_prompt={
-                           "image": 2,
-                       })
+    return ModelConfig(
+        MISTRAL_MODEL_ID,
+        runner="generate",
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 
 
 @pytest.fixture(scope="module")
@@ -137,21 +147,21 @@ def mistral_tokenizer():
 
 @pytest.fixture(scope="module")
 def image_url():
-    image = ImageAsset('cherry_blossom')
+    image = ImageAsset("cherry_blossom")
     base64 = encode_image_base64(image.pil_image)
     return f"data:image/jpeg;base64,{base64}"
 
 
 @pytest.fixture(scope="module")
 def video_url():
-    video = VideoAsset('baby_reading', 1)
+    video = VideoAsset("baby_reading", 1)
     base64 = encode_video_base64(video.np_ndarrays)
     return f"data:video/jpeg;base64,{base64}"
 
 
 @pytest.fixture(scope="module")
 def audio_url():
-    audio = AudioAsset('mary_had_lamb')
+    audio = AudioAsset("mary_had_lamb")
     base64 = encode_audio_base64(*audio.audio_and_sample_rate)
     return f"data:audio/ogg;base64,{base64}"
 
@@ -169,6 +179,27 @@ def _assert_mm_data_is_image_input(
     assert isinstance(image_data, list) and len(image_data) == image_count
 
 
+def _assert_mm_uuids(
+    mm_uuids: Optional[MultiModalUUIDDict],
+    media_count: int,
+    expected_uuids: list[Optional[str]],
+    modality: str = "image",
+) -> None:
+    if len(expected_uuids) > 0:
+        assert mm_uuids is not None
+        assert modality in mm_uuids
+
+        image_uuids = mm_uuids.get(modality)
+        assert image_uuids is not None
+
+        assert isinstance(image_uuids,
+                          list) and len(image_uuids) == media_count
+
+        assert image_uuids == expected_uuids
+    else:
+        assert mm_uuids is None
+
+
 ModalityType = Literal["image", "video", "audio"]
 MultiModalDataCounts = Mapping[ModalityType, int]
 
@@ -191,19 +222,22 @@ def test_parse_chat_messages_single_image(
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "What's in the image?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -215,87 +249,70 @@ def test_parse_chat_messages_single_image(
         "content": "<|image_1|>\nWhat's in the image?"
     }]
     _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
 
-def test_parse_chat_messages_empty_system(
-    mistral_model_config,
-    mistral_tokenizer,
+def test_parse_chat_messages_single_image_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
 ):
-    # Test string format
-    conversation, _ = parse_chat_messages(
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
-            "role": "system",
-            "content": ""
-        }, {
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "Who are you?"
-            }]
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
-        mistral_model_config,
-        mistral_tokenizer,
+        phi3v_model_config,
+        phi3v_tokenizer,
         content_format="string",
     )
-    assert conversation == [{
-        "role": "system",
-        "content": ""
-    }, {
-        "role": "user",
-        "content": "Who are you?"
-    }]
 
-    # Test openai format
-    conversation, _ = parse_chat_messages(
-        [{
-            "role": "system",
-            "content": ""
-        }, {
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "Who are you?"
-            }]
-        }],
-        mistral_model_config,
-        mistral_tokenizer,
-        content_format="openai",
-    )
     assert conversation == [{
-        "role": "system",
-        "content": [{
-            "type": "text",
-            "text": ""
-        }]
-    }, {
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": "Who are you?"
-        }]
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
     }]
+    _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_single_image_async(
+def test_parse_chat_messages_single_image_with_bad_uuid_format(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_future = parse_chat_messages_futures(
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "What's in the image?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                        "uuid": image_uuid,
+                    },
+                    "bad_uuid_key": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -306,30 +323,42 @@ async def test_parse_chat_messages_single_image_async(
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
-    _assert_mm_data_is_image_input(await mm_future, 1)
+    _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
 
-def test_parse_chat_messages_multiple_images(
+def test_parse_chat_messages_multiple_images_with_uuids(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "image_pil",
-                "image_pil": ImageAsset('cherry_blossom').pil_image
-            }, {
-                "type": "text",
-                "text": "What's in these images?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -340,33 +369,36 @@ def test_parse_chat_messages_multiple_images(
         "role":
         "user",
         "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
     }]
     _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
 
 @pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images_async(
+async def test_parse_chat_messages_single_image_with_uuid_async(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_future = parse_chat_messages_futures(
+    image_uuid = str(hash(image_url))
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "image_pil",
-                "image_pil": ImageAsset('cherry_blossom').pil_image
-            }, {
-                "type": "text",
-                "text": "What's in these images?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -374,59 +406,69 @@ async def test_parse_chat_messages_multiple_images_async(
     )
 
     assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
     }]
-    _assert_mm_data_is_image_input(await mm_future, 2)
+    _assert_mm_data_is_image_input(await mm_future, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
 
 
-def test_parse_chat_messages_placeholder_already_in_prompt(
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_with_uuids_async(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type":
-                "text",
-                "text":
-                "What's in <|image_1|> and how does it compare to <|image_2|>?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
+
     assert conversation == [{
         "role":
         "user",
         "content":
-        "What's in <|image_1|> and how does it compare to <|image_2|>?"
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
     }]
-    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_data_is_image_input(await mm_future, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
 
 
-def test_parse_chat_messages_placeholder_one_already_in_prompt(
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
         [{
             "role":
             "user",
@@ -435,21 +477,18 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
                     "type": "image_url",
                     "image_url": {
                         "url": image_url
-                    }
+                    },
                 },
                 {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                    "uuid": image_uuid2,
                 },
                 {
-                    "type":
-                    "text",
-                    "text":
-                    "What's in <|image_1|> and how does it compare to the other one?"  # noqa: E501
-                }
-            ]
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
@@ -460,268 +499,277 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
         "role":
         "user",
         "content":
-        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
-        "other one?"
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
     }]
-    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_data_is_image_input(await mm_future, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
 
 
-def test_parse_chat_messages_multiple_images_across_messages(
-    phi3v_model_config,
-    phi3v_tokenizer,
-    image_url,
+def test_parse_chat_messages_empty_system(
+    mistral_model_config,
+    mistral_tokenizer,
 ):
-    conversation, mm_data = parse_chat_messages(
-        [{
-            "role":
-            "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "What's in this image?"
-            }]
-        }, {
-            "role": "assistant",
-            "content": "Some stuff."
-        }, {
-            "role":
-            "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "What about this one?"
-            }]
-        }],
-        phi3v_model_config,
-        phi3v_tokenizer,
+    # Test string format
+    conversation, _, _ = parse_chat_messages(
+        [
+            {
+                "role": "system",
+                "content": ""
+            },
+            {
+                "role": "user",
+                "content": [{
+                    "type": "text",
+                    "text": "Who are you?"
+                }],
+            },
+        ],
+        mistral_model_config,
+        mistral_tokenizer,
         content_format="string",
     )
-
     assert conversation == [
+        {
+            "role": "system",
+            "content": ""
+        },
         {
             "role": "user",
-            "content": "<|image_1|>\nWhat's in this image?"
+            "content": "Who are you?"
         },
+    ]
+
+    # Test openai format
+    conversation, _, _ = parse_chat_messages(
+        [
+            {
+                "role": "system",
+                "content": ""
+            },
+            {
+                "role": "user",
+                "content": [{
+                    "type": "text",
+                    "text": "Who are you?"
+                }],
+            },
+        ],
+        mistral_model_config,
+        mistral_tokenizer,
+        content_format="openai",
+    )
+    assert conversation == [
         {
-            "role": "assistant",
-            "content": "Some stuff."
+            "role": "system",
+            "content": [{
+                "type": "text",
+                "text": ""
+            }]
         },
         {
             "role": "user",
-            "content": "<|image_2|>\nWhat about this one?"
+            "content": [{
+                "type": "text",
+                "text": "Who are you?"
+            }]
         },
     ]
-    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-def test_parse_chat_messages_context_text_format(
+@pytest.mark.asyncio
+async def test_parse_chat_messages_single_image_async(
     phi3v_model_config,
     phi3v_tokenizer,
+    image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
         [{
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "What's in this text?"
-            }]
-        }, {
-            "role": "assistant",
-            "content": "Some stuff."
-        }, {
-            "role": "user",
-            "content": "What about this one?"
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
-        content_format="openai",
+        content_format="string",
     )
 
-    assert conversation == [
-        {
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "What's in this text?"
-            }]
-        },
-        {
-            "role": "assistant",
-            "content": [{
-                "type": "text",
-                "text": "Some stuff."
-            }]
-        },
-        {
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "What about this one?"
-            }]
-        },
-    ]
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 
 
-def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+def test_parse_chat_messages_multiple_images(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    with warnings.catch_warnings():
-        warnings.filterwarnings(
-            "ignore",
-            message="coroutine 'async_get_and_parse_image' was never awaited")
-        with pytest.raises(ValueError, match="At most"):
-            parse_chat_messages(
-                [{
-                    "role":
-                    "user",
-                    "content": [{
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "text",
-                        "text": "What's in these images?"
-                    }]
-                }],
-                phi3v_model_config,
-                phi3v_tokenizer,
-                content_format="string",
-            )
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
 
-def test_parse_chat_messages_rejects_too_many_images_across_messages(
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_async(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    with warnings.catch_warnings():
-        warnings.filterwarnings(
-            "ignore",
-            message="coroutine 'async_get_and_parse_image' was never awaited")
-        with pytest.raises(ValueError, match="At most"):
-            parse_chat_messages(
-                [{
-                    "role":
-                    "user",
-                    "content": [{
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "text",
-                        "text": "What's in this image?"
-                    }]
-                }, {
-                    "role": "assistant",
-                    "content": "Some stuff."
-                }, {
-                    "role":
-                    "user",
-                    "content": [{
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }, {
-                        "type": "text",
-                        "text": "What about these two?"
-                    }]
-                }],
-                phi3v_model_config,
-                phi3v_tokenizer,
-                content_format="string",
-            )
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
 
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
-def test_parse_chat_messages_multiple_images_uncommon_input(
+
+def test_parse_chat_messages_placeholder_already_in_prompt(
     phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
             "content": [
-                "What's in these images?", {
-                    "image_url": image_url
-                }, {
-                    "image_url": image_url
-                }
-            ]
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type":
+                    "text",
+                    "text":
+                    "What's in <|image_1|> and how does it compare to <|image_2|>?",  # noqa: E501
+                },
+            ],
         }],
         phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
-
     assert conversation == [{
         "role":
         "user",
         "content":
-        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+        "What's in <|image_1|> and how does it compare to <|image_2|>?",
     }]
     _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
 
-def test_parse_chat_messages_multiple_images_interleave(
-    phi3v_model_config_mm_interleaved,
+def test_parse_chat_messages_placeholder_one_already_in_prompt(
+    phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
-            "content": [{
-                "type": "text",
-                "text": "I need you to compare this image"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "and this one"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "Do they have differences?"
-            }]
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type":
+                    "text",
+                    "text":
+                    "What's in <|image_1|> and how does it compare to the other one?",  # noqa: E501
+                },
+            ],
         }],
-        phi3v_model_config_mm_interleaved,
+        phi3v_model_config,
         phi3v_tokenizer,
         content_format="string",
     )
@@ -730,195 +778,968 @@ def test_parse_chat_messages_multiple_images_interleave(
         "role":
         "user",
         "content":
-        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-        "Do they have differences?"
+        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
+        "other one?",
     }]
     _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images_interleave_async(
-    phi3v_model_config_mm_interleaved,
+def test_parse_chat_messages_multiple_images_across_messages(
+    phi3v_model_config,
     phi3v_tokenizer,
     image_url,
 ):
-    conversation, mm_data = parse_chat_messages_futures(
-        [{
-            "role":
-            "user",
-            "content": [{
-                "type": "text",
-                "text": "I need you to compare this image"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "and this one"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "Do they have differences?"
-            }]
-        }],
-        phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "What's in this image?"
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "What about this one?"
+                    },
+                ],
+            },
+        ],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?"
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role": "user",
+            "content": "<|image_2|>\nWhat about this one?"
+        },
+    ]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": image_uuid,
+                    },
+                    {
+                        "type": "text",
+                        "text": "What's in this image?"
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": image_uuid,
+                    },
+                    {
+                        "type": "text",
+                        "text": "What about this one?"
+                    },
+                ],
+            },
+        ],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?"
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role": "user",
+            "content": "<|image_2|>\nWhat about this one?"
+        },
+    ]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
+
+
+def test_parse_chat_messages_context_text_format(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [{
+                    "type": "text",
+                    "text": "What's in this text?"
+                }],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role": "user",
+                "content": "What about this one?"
+            },
+        ],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="openai",
+    )
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What's in this text?"
+            }],
+        },
+        {
+            "role": "assistant",
+            "content": [{
+                "type": "text",
+                "text": "Some stuff."
+            }],
+        },
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What about this one?"
+            }],
+        },
+    ]
+    assert mm_data is None
+    assert mm_uuids is None
+
+
+def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            message="coroutine 'async_get_and_parse_image' was never awaited",
+        )
+        with pytest.raises(ValueError, match="At most"):
+            parse_chat_messages(
+                [{
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            },
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            },
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": "What's in these images?"
+                        },
+                    ],
+                }],
+                phi3v_model_config,
+                phi3v_tokenizer,
+                content_format="string",
+            )
+
+
+def test_parse_chat_messages_rejects_too_many_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            message="coroutine 'async_get_and_parse_image' was never awaited",
+        )
+        with pytest.raises(ValueError, match="At most"):
+            parse_chat_messages(
+                [
+                    {
+                        "role":
+                        "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": image_url
+                                },
+                            },
+                            {
+                                "type": "text",
+                                "text": "What's in this image?"
+                            },
+                        ],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "Some stuff."
+                    },
+                    {
+                        "role":
+                        "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": image_url
+                                },
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": image_url
+                                },
+                            },
+                            {
+                                "type": "text",
+                                "text": "What about these two?"
+                            },
+                        ],
+                    },
+                ],
+                phi3v_model_config,
+                phi3v_tokenizer,
+                content_format="string",
+            )
+
+
+def test_parse_chat_messages_multiple_images_uncommon_input(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                "What's in these images?",
+                {
+                    "image_url": image_url
+                },
+                {
+                    "image_url": image_url
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+def test_parse_chat_messages_multiple_images_interleave(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "I need you to compare this image",
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "and this one"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Do they have differences?"
+                },
+            ],
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+        "Do they have differences?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_interleave_async(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "I need you to compare this image",
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "and this one"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                },
+                {
+                    "type": "text",
+                    "text": "Do they have differences?"
+                },
+            ],
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+        "Do they have differences?",
+    }]
+    _assert_mm_data_is_image_input(await mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "I need you to compare this image",
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "and this one"
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    },
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "Do they have differences?"
+                },
+            ],
+        }],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
+        "Do they have differences?",
+    }]
+    _assert_mm_data_is_image_input(await mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
+
+
+def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "Be accurate."
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                ],
+            },
+        ],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role": "user",
+            "content": "What's on this image?\n<|image_2|>"
+        },
+    ]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(  # noqa: E501
+    phi3v_model_config_mm_interleaved,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": image_uuid,
+                    },
+                    {
+                        "type": "text",
+                        "text": "Be accurate."
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": image_uuid,
+                    },
+                ],
+            },
+        ],
+        phi3v_model_config_mm_interleaved,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role": "user",
+            "content": "What's on this image?\n<|image_2|>"
+        },
+    ]
+    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
+
+
+def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
+    qwen25omni_model_config_mm_interleaved,
+    qwen25omni_tokenizer,
+    image_url,
+    video_url,
+    audio_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "Now listen to this audio"
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": {
+                            "url": audio_url
+                        }
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "And what's in the video?"
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": {
+                            "url": video_url
+                        }
+                    },
+                ],
+            },
+        ],
+        qwen25omni_model_config_mm_interleaved,
+        qwen25omni_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-        "Do they have differences?"
-    }]
-    _assert_mm_data_is_image_input(await mm_data, 2)
-
-
-def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
-    phi3v_model_config_mm_interleaved,
-    phi3v_tokenizer,
-    image_url,
-):
-    conversation, mm_data = parse_chat_messages(
-        [{
+    assert conversation == [
+        {
             "role":
             "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's on this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "Be accurate."
-                },
-            ]
-        }, {
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+        },
+        {
             "role": "assistant",
             "content": "Some stuff."
-        }, {
+        },
+        {
             "role":
             "user",
-            "content": [{
-                "type": "text",
-                "text": "What's on this image?"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }]
-        }],
-        phi3v_model_config_mm_interleaved,
-        phi3v_tokenizer,
-        content_format="string",
-    )
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+        },
+    ]
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "What's on this image?\n<|image_1|>\nBe accurate."
-    }, {
-        "role": "assistant",
-        "content": "Some stuff."
-    }, {
-        "role": "user",
-        "content": "What's on this image?\n<|image_2|>"
-    }]
-    _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=[None, None])
+    _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
 
 
-def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
-        qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer,
-        image_url, video_url, audio_url):
-    conversation, mm_data = parse_chat_messages(
-        [{
+def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(  # noqa: E501
+    qwen25omni_model_config_mm_interleaved,
+    qwen25omni_tokenizer,
+    image_url,
+    video_url,
+    audio_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "Now listen to this audio"
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": {
+                            "url": audio_url
+                        },
+                        "uuid": "audio_123",
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "And what's in the video?"
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": {
+                            "url": video_url
+                        },
+                        "uuid": "video_123",
+                    },
+                ],
+            },
+        ],
+        qwen25omni_model_config_mm_interleaved,
+        qwen25omni_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
             "role":
             "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's on this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": "Now listen to this audio"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
-                    }
-                },
-            ]
-        }, {
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+        },
+        {
             "role": "assistant",
             "content": "Some stuff."
-        }, {
+        },
+        {
             "role":
             "user",
-            "content": [{
-                "type": "text",
-                "text": "What's on this image?"
-            }, {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            }, {
-                "type": "text",
-                "text": "And what's in the video?"
-            }, {
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                }
-            }]
-        }],
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+        },
+    ]
+
+    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=["image_123", "image_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="video",
+                     expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=["audio_123"])
+
+
+def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
+    qwen25omni_model_config_mm_interleaved,
+    qwen25omni_tokenizer,
+    image_url,
+    video_url,
+    audio_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        },
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "Now listen to this audio"
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": {
+                            "url": audio_url
+                        }
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "And what's in the video?"
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": {
+                            "url": video_url
+                        },
+                        "uuid": "video_123",
+                    },
+                ],
+            },
+        ],
         qwen25omni_model_config_mm_interleaved,
         qwen25omni_tokenizer,
         content_format="string",
     )
 
-    assert conversation == [{
-        "role":
-        "user",
-        "content":
-        "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-        "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>"
-    }, {
-        "role": "assistant",
-        "content": "Some stuff."
-    }, {
-        "role":
-        "user",
-        "content":
-        "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
-        "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>"
-    }]
+    assert conversation == [
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+        },
+    ]
 
     _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=["image_123", None])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="video",
+                     expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
 
 
 def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
@@ -929,7 +1750,8 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     with pytest.raises(
             ValueError,
             match=r"Found more '<|image_1|>' placeholders in input prompt "
-            "than actual multimodal data items."):
+            "than actual multimodal data items.",
+    ):
         parse_chat_messages(
             [{
                 "role":
@@ -952,9 +1774,9 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
                         "text",
                         "text":
                         "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
-                        "Do they have differences?"
+                        "Do they have differences?",
                     },
-                ]
+                ],
             }],
             phi3v_model_config_mm_interleaved,
             phi3v_tokenizer,
@@ -969,31 +1791,38 @@ def test_mllama_single_image(
     image_url,
 ):
     """Ensures that a single image is parsed correctly mllama."""
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
-            "content": [{
-                'type': 'text',
-                'text': 'The content of this image is:'
-            }, {
-                "image_url": image_url
-            }]
+            "content": [
+                {
+                    "type": "text",
+                    "text": "The content of this image is:"
+                },
+                {
+                    "image_url": image_url
+                },
+            ],
         }],
         mllama_model_config,
         mllama_tokenizer,
         content_format="openai",
     )
     _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
     assert conversation == [{
-        'role':
-        'user',
-        'content': [{
-            'type': 'text',
-            'text': 'The content of this image is:'
-        }, {
-            'type': 'image'
-        }]
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "The content of this image is:"
+            },
+            {
+                "type": "image"
+            },
+        ],
     }]
 
 
@@ -1003,46 +1832,52 @@ def test_mllama_interleaved_images(
     image_url,
 ):
     """Ensures that multiple image are parsed as interleaved dicts."""
-    conversation, mm_data = parse_chat_messages(
+    conversation, mm_data, mm_uuids = parse_chat_messages(
         [{
             "role":
             "user",
             "content": [
                 {
-                    'type': 'text',
-                    'text': 'The content of the first image is:'
+                    "type": "text",
+                    "text": "The content of the first image is:",
                 },
                 {
                     "image_url": image_url
                 },
                 {
-                    'type': 'text',
-                    'text': 'The content of the second image is:'
+                    "type": "text",
+                    "text": "The content of the second image is:",
                 },
                 {
                     "image_url": image_url
                 },
-            ]
+            ],
         }],
         mllama_model_config,
         mllama_tokenizer,
         content_format="openai",
     )
     _assert_mm_data_is_image_input(mm_data, 2)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
     assert conversation == [{
-        'role':
-        'user',
-        'content': [{
-            'type': 'text',
-            'text': 'The content of the first image is:'
-        }, {
-            'type': 'image'
-        }, {
-            'type': 'text',
-            'text': 'The content of the second image is:'
-        }, {
-            'type': 'image'
-        }]
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "The content of the first image is:"
+            },
+            {
+                "type": "image"
+            },
+            {
+                "type": "text",
+                "text": "The content of the second image is:"
+            },
+            {
+                "type": "image"
+            },
+        ],
     }]
 
 
@@ -1053,34 +1888,36 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
     def get_conversation(is_hf: bool):
         img_part = {"type": "image_url", "image_url": {"url": image_url}}
         if is_hf:
-            img_part = {'type': 'image'}
+            img_part = {"type": "image"}
         return [{
-            'role':
-            'user',
-            'content': [
+            "role":
+            "user",
+            "content": [
                 {
-                    'type': 'text',
-                    'text': 'The content of the first image is:'
+                    "type": "text",
+                    "text": "The content of the first image is:",
                 },
                 img_part,
                 {
-                    'type': 'text',
-                    'text': 'The content of the second image is:'
+                    "type": "text",
+                    "text": "The content of the second image is:",
                 },
                 img_part,
                 {
-                    'type': 'text',
-                    'text': 'What animal is in the first image?'
+                    "type": "text",
+                    "text": "What animal is in the first image?",
                 },
-            ]
+            ],
         }]
 
     # Build a config for the model
-    model_config = ModelConfig(model,
-                               runner="generate",
-                               limit_mm_per_prompt={
-                                   "image": 2,
-                               })
+    model_config = ModelConfig(
+        model,
+        runner="generate",
+        limit_mm_per_prompt={
+            "image": 2,
+        },
+    )
 
     # Build the tokenizer group and grab the underlying tokenizer
     tokenizer_group = TokenizerGroup(
@@ -1102,7 +1939,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
 
     # Now parse with vLLMs chat utils & apply the template
     vllm_conversation = get_conversation(is_hf=False)
-    conversation, _ = parse_chat_messages(
+    conversation, _, _ = parse_chat_messages(
         vllm_conversation,
         model_config,
         tokenizer_group,
@@ -1126,7 +1963,8 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
     [
         QWEN2VL_MODEL_ID,  # tokenizer.chat_template is of type str
         HERMES_MODEL_ID,  # tokenizer.chat_template is of type dict
-    ])
+    ],
+)
 @pytest.mark.parametrize("use_tools", [True, False])
 def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     """checks that chat_template is a dict type for HF models."""
@@ -1140,7 +1978,9 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
 
     # Build the tokenizer group and grab the underlying tokenizer
     tokenizer_group = TokenizerGroup(
@@ -1152,14 +1992,14 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
     )
     tokenizer = tokenizer_group.tokenizer
 
-    tools = [{
+    tools = ([{
         "type": "function",
         "function": {
             "name": "dummy_function_name",
             "description": "This is a dummy function",
-            "parameters": sample_json_schema
-        }
-    }] if use_tools else None
+            "parameters": sample_json_schema,
+        },
+    }] if use_tools else None)
 
     # Test detecting the tokenizer's chat_template
     chat_template = resolve_hf_chat_template(
@@ -1196,7 +2036,9 @@ def test_resolve_content_format_hf_defined(model, expected_format):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
 
     tokenizer_group = TokenizerGroup(
         model,
@@ -1256,7 +2098,9 @@ def test_resolve_content_format_fallbacks(model, expected_format):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
 
     tokenizer_group = TokenizerGroup(
         model_config.tokenizer,
@@ -1386,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
         }],
     }]
 
-    conversation_with_thinking, _ = parse_chat_messages(
+    conversation_with_thinking, _, _ = parse_chat_messages(
         messages,
         mistral_model_config,
         mistral_tokenizer,
diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e6a4c85ff7900a163e3843f7c8b1cb99ad461db
--- /dev/null
+++ b/tests/entrypoints/test_context.py
@@ -0,0 +1,425 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+from openai_harmony import StreamState
+
+from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
+from vllm.outputs import CompletionOutput, RequestOutput
+
+
+# Helper function for Python < 3.10 compatibility
+async def async_next(async_iterator):
+    """Compatibility function equivalent to Python 3.10's anext()."""
+    return await async_iterator.__anext__()
+
+
+def create_mock_request_output(
+    prompt_token_ids=None,
+    output_token_ids=None,
+    num_cached_tokens=0,
+    finished=True,
+):
+    """Helper function to create a mock RequestOutput object for testing."""
+    outputs = []
+    token_ids = output_token_ids if output_token_ids is not None else []
+    outputs = [
+        CompletionOutput(
+            index=0,
+            text="Test output",
+            token_ids=token_ids,
+            cumulative_logprob=0.0,
+            logprobs=None,
+            finish_reason=None,
+            stop_reason=None,
+        )
+    ]
+
+    return RequestOutput(
+        request_id="test-id",
+        prompt="Test prompt",
+        prompt_token_ids=prompt_token_ids,
+        prompt_logprobs=None,
+        outputs=outputs,
+        finished=finished,
+        num_cached_tokens=num_cached_tokens,
+    )
+
+
+async def generate_mock_outputs(num_turns,
+                                prompt_token_counts,
+                                output_token_counts,
+                                cached_token_counts=None):
+    """Generate a sequence of mock RequestOutput objects to simulate multiple
+    turns."""
+    if cached_token_counts is None:
+        cached_token_counts = [0] * num_turns
+
+    for i in range(num_turns):
+        # Create mock prompt token IDs and output token IDs
+        prompt_token_ids = list(range(1, prompt_token_counts[i] + 1))
+        output_token_ids = list(range(1, output_token_counts[i] + 1))
+
+        # Create and yield the RequestOutput
+        yield create_mock_request_output(
+            prompt_token_ids=prompt_token_ids,
+            output_token_ids=output_token_ids,
+            num_cached_tokens=cached_token_counts[i],
+        )
+
+
+@pytest.fixture
+def mock_parser():
+    """Set up a mock parser for tests."""
+    with patch("vllm.entrypoints.context.get_streamable_parser_for_assistant"
+               ) as mock_parser_factory:
+        # Create a mock parser object
+        parser = MagicMock()
+        parser.messages = []
+        parser.current_channel = None
+        parser.state = StreamState.EXPECT_START
+        mock_parser_factory.return_value = parser
+        yield parser
+
+
+def test_single_turn_token_counting():
+    """Test token counting behavior for a single turn."""
+    # Create a context
+    context = HarmonyContext(messages=[], available_tools=[])
+
+    # Create a mock RequestOutput with specific token counts
+    mock_output = create_mock_request_output(
+        prompt_token_ids=[1, 2, 3, 4, 5],  # 5 prompt tokens
+        output_token_ids=[6, 7, 8],  # 3 output tokens
+        num_cached_tokens=2,  # 2 cached tokens
+    )
+
+    # Append the output to the context
+    context.append_output(mock_output)
+
+    # Verify the token counts
+    assert context.num_prompt_tokens == 5
+    assert context.num_output_tokens == 3
+    assert context.num_cached_tokens == 2
+    assert context.num_tool_output_tokens == 0  # No tool tokens in first turn
+
+    # Verify internal state tracking
+    assert not context.is_first_turn
+    assert context.previous_turn.input_tokens == 5
+    assert context.previous_turn.output_tokens == 3
+
+
+@pytest.mark.asyncio
+async def test_multi_turn_token_counting():
+    """Test token counting behavior across multiple turns with tool output."""
+    # Create a context
+    context = HarmonyContext(messages=[], available_tools=["browser"])
+
+    # Simulate a conversation with 3 turns
+    # Turn 1: prefill 5, decode 3, tool 7
+    # Turn 2: prefill 15, cached 5, decode 4, tool 1
+    # Turn 3: prefill 20, cached 15, decode 5
+    prompt_token_counts = [5, 15, 20]
+    output_token_counts = [3, 4, 5]
+    cached_token_counts = [0, 5, 15]
+    mock_generator = generate_mock_outputs(3, prompt_token_counts,
+                                           output_token_counts,
+                                           cached_token_counts)
+
+    # First turn - initial prompt and response
+    mock_output1 = await async_next(mock_generator)
+    context.append_output(mock_output1)
+
+    # At this point, we should have 5 prompt tokens and 3 output tokens
+    assert context.num_prompt_tokens == 5
+    assert context.num_output_tokens == 3
+    assert context.num_tool_output_tokens == 0
+
+    # Second turn - after tool output
+    mock_output2 = await async_next(mock_generator)
+    context.append_output(mock_output2)
+    # Current prompt tokens (15) - last_turn_input_tokens (5) -
+    # last_turn_output_tokens (3) = 7
+    expected_tool_output = 7
+
+    assert context.num_prompt_tokens == 5 + 15
+    assert context.num_output_tokens == 3 + 4
+    assert context.num_tool_output_tokens == expected_tool_output
+    assert context.num_cached_tokens == 5
+
+    # Third turn - final response
+    mock_output3 = await async_next(mock_generator)
+    context.append_output(mock_output3)
+    # Additional tool output tokens from third turn:
+    # Current prompt (20) - last_turn_input_tokens (15) -
+    # last_turn_output_tokens (4) = 1
+    expected_tool_output = 7 + 1
+
+    assert context.num_prompt_tokens == 5 + 15 + 20
+    assert context.num_output_tokens == 3 + 4 + 5
+    assert context.num_tool_output_tokens == expected_tool_output
+    assert context.num_cached_tokens == 5 + 15
+
+
+def test_empty_output_tokens():
+    """Test behavior when RequestOutput has empty output tokens."""
+    context = HarmonyContext(messages=[], available_tools=[])
+
+    # Create a RequestOutput with empty output tokens
+    mock_output = create_mock_request_output(
+        prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
+        output_token_ids=[],  # Empty output tokens list
+        num_cached_tokens=1,
+    )
+
+    context.append_output(mock_output)
+
+    # Should handle empty outputs gracefully
+    assert context.num_prompt_tokens == 3
+    assert context.num_output_tokens == 0  # No output tokens
+    assert context.num_cached_tokens == 1
+    assert context.num_tool_output_tokens == 0
+
+
+def test_missing_prompt_token_ids():
+    """Test behavior when RequestOutput has None prompt_token_ids."""
+    context = HarmonyContext(messages=[], available_tools=[])
+
+    mock_output = create_mock_request_output(
+        prompt_token_ids=None,  # No prompt token IDs
+        output_token_ids=[1, 2],  # 2 output tokens
+        num_cached_tokens=0,
+    )
+
+    # Logger.error will be called, but we don't need to check for warnings
+    # here Just ensure it doesn't raise an exception
+    context.append_output(mock_output)
+
+    # Should handle missing prompt tokens gracefully
+    assert context.num_prompt_tokens == 0
+    assert context.num_output_tokens == 2
+    assert context.num_cached_tokens == 0
+    assert context.num_tool_output_tokens == 0
+
+
+def test_reasoning_tokens_counting(mock_parser):
+    """Test that reasoning tokens are counted correctly."""
+    context = HarmonyContext(messages=[], available_tools=[])
+
+    # Mock parser to simulate reasoning channel
+    mock_parser.current_channel = "analysis"  # Reasoning channel
+
+    mock_output = create_mock_request_output(
+        prompt_token_ids=[1, 2, 3],
+        output_token_ids=[4, 5, 6, 7],  # 4 tokens, all in reasoning
+        num_cached_tokens=0,
+    )
+
+    context.append_output(mock_output)
+
+    # All output tokens should be counted as reasoning
+    assert context.num_reasoning_tokens == 4
+    assert context.num_output_tokens == 4
+
+
+def test_zero_tokens_edge_case():
+    """Test behavior with all zero token counts."""
+    context = HarmonyContext(messages=[], available_tools=[])
+
+    # Create a request with empty lists (not None) for both prompt and
+    # output tokens
+    mock_output = create_mock_request_output(
+        prompt_token_ids=[],  # Empty prompt tokens
+        output_token_ids=[],  # Empty output tokens
+        num_cached_tokens=0,
+    )
+
+    context.append_output(mock_output)
+
+    # All counts should be zero
+    assert context.num_prompt_tokens == 0
+    assert context.num_output_tokens == 0
+    assert context.num_cached_tokens == 0
+    assert context.num_tool_output_tokens == 0
+    assert context.num_reasoning_tokens == 0
+
+
+@pytest.mark.asyncio
+async def test_single_turn_no_tool_output():
+    """Test that first turn never generates tool output tokens."""
+    context = HarmonyContext(
+        messages=[],
+        available_tools=["browser"]  # Tools available
+    )
+
+    # Even with large prompt in first turn, no tool tokens should be counted
+    mock_output = create_mock_request_output(
+        prompt_token_ids=list(range(100)),  # 100 tokens
+        output_token_ids=[1, 2, 3],
+        num_cached_tokens=0,
+    )
+
+    context.append_output(mock_output)
+
+    # First turn should never have tool output tokens
+    assert context.num_tool_output_tokens == 0
+    assert context.is_first_turn is False  # Should be updated after first turn
+
+
+@pytest.mark.asyncio
+async def test_negative_tool_tokens_edge_case():
+    """Test edge case where calculation could result in negative tool
+    tokens. We should log an error and clamp the value to 0."""
+    # Use patch to check if logger.error was called
+    with patch("vllm.entrypoints.context.logger.error") as mock_log:
+        context = HarmonyContext(messages=[], available_tools=["browser"])
+
+        # First turn
+        mock_output1 = create_mock_request_output(
+            prompt_token_ids=list(range(10)),  # 10 tokens
+            output_token_ids=[1, 2, 3, 4, 5],  # 5 tokens
+        )
+        context.append_output(mock_output1)
+
+        # Second turn with fewer new tokens than previous output
+        # This could happen in edge cases with aggressive caching
+        mock_output2 = create_mock_request_output(
+            prompt_token_ids=list(range(12)),  # 12 tokens (only 2 new)
+            output_token_ids=[6, 7],  # 2 tokens
+        )
+        context.append_output(mock_output2)
+
+        # Calculated negative tool tokens (12 - 10 - 5 = -3) should be clamped
+        # to 0 and an error should be logged
+        assert context.num_tool_output_tokens == 0
+        assert context.num_prompt_tokens == 10 + 12
+        assert context.num_output_tokens == 5 + 2
+
+        # Verify the error was logged properly
+        mock_log.assert_called_once()
+
+        # Extract the actual log message and arguments from the call
+        args, _ = mock_log.call_args
+        log_message = args[0]
+
+        # Check for key parts of the message
+        assert "Negative tool output tokens calculated" in log_message
+        assert "-3" in str(args)  # Check that -3 is in the arguments
+
+
+@pytest.mark.asyncio
+async def test_streaming_multi_turn_token_counting(mock_parser):
+    """Test token counting for streaming multi-turn conversations.
+    
+    This test focuses on how StreamingHarmonyContext counts tokens in a 
+    multi-turn conversation with streaming (token-by-token) outputs and 
+    message boundaries.
+    """
+    # Create a streaming context
+    context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
+
+    # Simulate three turns of conversation:
+    # Turn 1: stream tokens one by one, then finish the message
+    # Turn 2: new prompt, stream more tokens with a reasoning segment
+    # Turn 3: new prompt with tool output and cached tokens
+
+    # First turn: 3 tokens streamed one by one
+    # First token of first turn
+    context.append_output(
+        create_mock_request_output(
+            prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
+            output_token_ids=[101],  # Single token
+            num_cached_tokens=0,
+            finished=False,  # Not end of message yet
+        ))
+
+    # Second token of first turn
+    context.append_output(
+        create_mock_request_output(
+            output_token_ids=[102],
+            finished=False,
+        ))
+
+    # Last token of first turn (finished=True signals end of message)
+    context.append_output(
+        create_mock_request_output(
+            output_token_ids=[103],
+            finished=True,  # End of message
+        ))
+
+    # Check token counts after first turn
+    assert context.num_prompt_tokens == 3  # Initial prompt tokens
+    assert context.num_output_tokens == 3  # Three output tokens
+    assert context.num_cached_tokens == 0
+    assert context.num_tool_output_tokens == 0  # No tool output in first turn
+    assert context.first_tok_of_message is True  # Ready for next message
+
+    # Second turn: reasoning tokens in analysis channel
+    mock_parser.current_channel = "analysis"  # Set to reasoning channel
+
+    # First token of second turn
+    context.append_output(
+        create_mock_request_output(
+            prompt_token_ids=[1, 2, 3, 101, 102, 103, 4,
+                              5],  # 8 tokens (includes previous)
+            output_token_ids=[201],
+            num_cached_tokens=3,  # Some tokens cached
+            finished=False,
+        ))
+
+    # More tokens in reasoning channel
+    context.append_output(
+        create_mock_request_output(
+            output_token_ids=[202],
+            finished=False,
+        ))
+
+    context.append_output(
+        create_mock_request_output(
+            output_token_ids=[203],
+            finished=True,  # End of reasoning message
+        ))
+
+    # Check counts after second turn (reasoning message)
+    assert context.num_prompt_tokens == 3 + 8  # Initial + second prompt
+    assert context.num_output_tokens == 3 + 3  # First turn + second turn
+    assert context.num_reasoning_tokens == 3  # All tokens in analysis channel
+    assert context.num_cached_tokens == 3  # Cached tokens from second turn
+
+    # Formula: this turn prompt tokens - last turn prompt - last turn output
+    expected_tool_tokens = 8 - 3 - 3  # = 2
+    assert context.num_tool_output_tokens == expected_tool_tokens
+
+    # Third turn: regular output channel
+    mock_parser.current_channel = "final"  # Switch back to regular channel
+
+    # Third turn (with more cached tokens)
+    context.append_output(
+        create_mock_request_output(
+            prompt_token_ids=[
+                1, 2, 3, 101, 102, 103, 4, 5, 201, 202, 203, 6, 7
+            ],  # 13 tokens
+            output_token_ids=[301],
+            num_cached_tokens=8,  # More cached tokens
+            finished=False,
+        ))
+
+    context.append_output(
+        create_mock_request_output(
+            output_token_ids=[302],
+            finished=True,
+        ))
+
+    # Final token counts check
+    assert context.num_prompt_tokens == 3 + 8 + 13  # All prompts
+    assert context.num_output_tokens == 3 + 3 + 2  # All outputs
+    assert context.num_reasoning_tokens == 3  # Unchanged from second turn
+    assert context.num_cached_tokens == 3 + 8  # Accumulated cached tokens
+
+    # Additional tool tokens from third turn
+    # Formula: this turn prompt - last turn prompt - last turn output
+    additional_tool_tokens = 13 - 8 - 3  # = 2
+    assert context.num_tool_output_tokens == expected_tool_tokens \
+        + additional_tool_tokens
diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f55b1fba613ba84ec2e7e898c902209d979b32a
--- /dev/null
+++ b/tests/entrypoints/test_renderer.py
@@ -0,0 +1,333 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import io
+from dataclasses import dataclass
+from typing import Optional
+from unittest.mock import AsyncMock, MagicMock
+
+import pybase64
+import pytest
+import torch
+
+from vllm.entrypoints.renderer import CompletionRenderer, RenderConfig
+from vllm.inputs.data import is_embeds_prompt
+
+
+@dataclass
+class MockModelConfig:
+    max_model_len: int = 100
+    encoder_config: Optional[dict] = None
+
+
+class MockTokenizerResult:
+
+    def __init__(self, input_ids):
+        self.input_ids = input_ids
+
+
+@pytest.fixture
+def mock_model_config():
+    return MockModelConfig()
+
+
+@pytest.fixture
+def mock_tokenizer():
+    tokenizer = MagicMock()
+    return tokenizer
+
+
+@pytest.fixture
+def mock_async_tokenizer():
+    async_tokenizer = AsyncMock()
+    return async_tokenizer
+
+
+@pytest.fixture
+def renderer(mock_model_config, mock_tokenizer):
+    return CompletionRenderer(model_config=mock_model_config,
+                              tokenizer=mock_tokenizer,
+                              async_tokenizer_pool={})
+
+
+class TestRenderPrompt:
+    """Test Category A: Basic Functionality Tests"""
+
+    @pytest.mark.asyncio
+    async def test_token_input(self, renderer):
+        tokens = [101, 7592, 2088]
+        results = await renderer.render_prompt(
+            prompt_or_prompts=tokens, config=RenderConfig(max_length=100))
+
+        assert len(results) == 1
+        assert results[0]["prompt_token_ids"] == tokens
+
+    @pytest.mark.asyncio
+    async def test_token_list_input(self, renderer):
+        token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]]
+        results = await renderer.render_prompt(
+            prompt_or_prompts=token_lists, config=RenderConfig(max_length=100))
+
+        assert len(results) == 3
+        assert results[0]["prompt_token_ids"] == [101, 7592, 2088]
+        assert results[1]["prompt_token_ids"] == [102, 1234, 5678, 9012]
+        assert results[2]["prompt_token_ids"] == [103, 4567]
+
+    @pytest.mark.asyncio
+    async def test_text_input(self, renderer, mock_async_tokenizer):
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 7592, 2088])
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        results = await renderer.render_prompt(
+            prompt_or_prompts="Hello world",
+            config=RenderConfig(max_length=100))
+
+        assert len(results) == 1
+        assert results[0]["prompt_token_ids"] == [101, 7592, 2088]
+        mock_async_tokenizer.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_text_list_input(self, renderer, mock_async_tokenizer):
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 7592, 2088])
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        text_list_input = ["Hello world", "How are you?", "Good morning"]
+        results = await renderer.render_prompt(
+            prompt_or_prompts=text_list_input,
+            config=RenderConfig(max_length=100))
+
+        assert len(results) == 3
+        for result in results:
+            assert result["prompt_token_ids"] == [101, 7592, 2088]
+        assert mock_async_tokenizer.call_count == 3
+
+    @pytest.mark.asyncio
+    async def test_no_truncation(self, renderer, mock_async_tokenizer):
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 7592, 2088])
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        results = await renderer.render_prompt(
+            prompt_or_prompts="Hello world",
+            config=RenderConfig(max_length=100))
+
+        assert len(results) == 1
+        call_args = mock_async_tokenizer.call_args
+        assert "truncation" not in call_args.kwargs or call_args.kwargs[
+            "truncation"] is False
+
+    @pytest.mark.asyncio
+    async def test_truncation_positive(self, renderer, mock_async_tokenizer):
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 7592, 2088])  # Truncated
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        results = await renderer.render_prompt(prompt_or_prompts="Hello world",
+                                               config=RenderConfig(
+                                                   max_length=100,
+                                                   truncate_prompt_tokens=50))
+
+        assert len(results) == 1
+        call_args = mock_async_tokenizer.call_args
+        assert call_args.kwargs["truncation"] is True
+        assert call_args.kwargs["max_length"] == 50
+
+    @pytest.mark.asyncio
+    async def test_truncation_negative(self, renderer, mock_async_tokenizer):
+        # Test that negative truncation uses model's max_model_len
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 7592, 2088])  # Truncated to max_model_len
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        results = await renderer.render_prompt(prompt_or_prompts="Hello world",
+                                               config=RenderConfig(
+                                                   max_length=200,
+                                                   truncate_prompt_tokens=-1))
+
+        assert len(results) == 1
+        call_args = mock_async_tokenizer.call_args
+        assert call_args.kwargs["truncation"] is True
+        assert call_args.kwargs["max_length"] == 100  # model's max_model_len
+
+    @pytest.mark.asyncio
+    async def test_token_truncation_last_elements(self, renderer):
+        # Test that token truncation keeps the last N elements
+        long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108,
+                       109]  # 10 tokens
+        results = await renderer.render_prompt(prompt_or_prompts=long_tokens,
+                                               config=RenderConfig(
+                                                   max_length=100,
+                                                   truncate_prompt_tokens=5))
+
+        assert len(results) == 1
+        # Should keep the last 5 tokens: [105, 106, 107, 108, 109]
+        assert results[0]["prompt_token_ids"] == [105, 106, 107, 108, 109]
+
+    @pytest.mark.asyncio
+    async def test_max_length_exceeded(self, renderer):
+        long_tokens = list(range(150))  # Exceeds max_model_len=100
+
+        with pytest.raises(ValueError, match="maximum context length"):
+            await renderer.render_prompt(prompt_or_prompts=long_tokens,
+                                         config=RenderConfig(max_length=100))
+
+    @pytest.mark.asyncio
+    async def test_no_tokenizer_for_text(self, mock_model_config):
+        renderer_no_tokenizer = CompletionRenderer(
+            model_config=mock_model_config,
+            tokenizer=None,
+            async_tokenizer_pool={})
+
+        with pytest.raises(ValueError, match="No tokenizer available"):
+            await renderer_no_tokenizer.render_prompt(
+                prompt_or_prompts="Hello world",
+                config=RenderConfig(max_length=100))
+
+    @pytest.mark.asyncio
+    async def test_token_input_with_needs_detokenization(
+            self, renderer, mock_async_tokenizer):
+        # When needs_detokenization=True for token inputs, renderer should
+        # use the async tokenizer to decode and include the original text
+        # in the returned prompt object.
+        mock_async_tokenizer.decode = AsyncMock(return_value="decoded text")
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        tokens = [1, 2, 3, 4]
+        results = await renderer.render_prompt(
+            prompt_or_prompts=tokens,
+            config=RenderConfig(needs_detokenization=True),
+        )
+
+        assert len(results) == 1
+        assert results[0]["prompt_token_ids"] == tokens
+        assert results[0]["prompt"] == "decoded text"
+        mock_async_tokenizer.decode.assert_awaited_once()
+
+
+class TestRenderEmbedPrompt:
+
+    def _create_test_embed_bytes(self, tensor: torch.Tensor) -> bytes:
+        """Helper to create base64-encoded tensor bytes"""
+        buffer = io.BytesIO()
+        torch.save(tensor, buffer)
+        buffer.seek(0)
+        return pybase64.b64encode(buffer.read())
+
+    @pytest.mark.asyncio
+    async def test_single_prompt_embed(self, renderer):
+        # Create a test tensor
+        test_tensor = torch.randn(10, 768, dtype=torch.float32)
+        embed_bytes = self._create_test_embed_bytes(test_tensor)
+
+        results = await renderer.render_prompt_and_embeds(
+            prompt_embeds=embed_bytes,
+            config=RenderConfig(cache_salt="test_salt"),
+        )
+
+        assert len(results) == 1
+        assert is_embeds_prompt(results[0])
+        assert torch.allclose(results[0]["prompt_embeds"], test_tensor)
+        assert results[0]["cache_salt"] == "test_salt"
+
+    @pytest.mark.asyncio
+    async def test_multiple_prompt_embeds(self, renderer):
+        # Create multiple test tensors
+        test_tensors = [
+            torch.randn(8, 512, dtype=torch.float32),
+            torch.randn(12, 512, dtype=torch.float32),
+        ]
+        embed_bytes_list = [
+            self._create_test_embed_bytes(t) for t in test_tensors
+        ]
+
+        results = await renderer.render_prompt_and_embeds(
+            prompt_embeds=embed_bytes_list,
+            config=RenderConfig(),
+        )
+
+        assert len(results) == 2
+        for i, result in enumerate(results):
+            assert is_embeds_prompt(result)
+            assert torch.allclose(result["prompt_embeds"], test_tensors[i])
+
+    @pytest.mark.asyncio
+    async def test_prompt_embed_truncation(self, renderer):
+        # Create tensor with more tokens than truncation limit
+        test_tensor = torch.randn(20, 768, dtype=torch.float32)
+        embed_bytes = self._create_test_embed_bytes(test_tensor)
+
+        results = await renderer.render_prompt_and_embeds(
+            prompt_embeds=embed_bytes,
+            config=RenderConfig(truncate_prompt_tokens=10),
+        )
+
+        assert len(results) == 1
+        # Should keep last 10 tokens
+        expected = test_tensor[-10:]
+        assert torch.allclose(results[0]["prompt_embeds"], expected)
+
+    @pytest.mark.asyncio
+    async def test_prompt_embed_different_dtypes(self, renderer):
+        # Test different supported dtypes
+        dtypes = [torch.float32, torch.float16, torch.bfloat16]
+
+        for dtype in dtypes:
+            test_tensor = torch.randn(5, 256, dtype=dtype)
+            embed_bytes = self._create_test_embed_bytes(test_tensor)
+
+            results = await renderer.render_prompt_and_embeds(
+                prompt_embeds=embed_bytes,
+                config=RenderConfig(),
+            )
+
+            assert len(results) == 1
+            assert results[0]["prompt_embeds"].dtype == dtype
+
+    @pytest.mark.asyncio
+    async def test_prompt_embed_squeeze_batch_dim(self, renderer):
+        # Test tensor with batch dimension gets squeezed
+        test_tensor = torch.randn(1, 10, 768, dtype=torch.float32)
+        embed_bytes = self._create_test_embed_bytes(test_tensor)
+
+        results = await renderer.render_prompt_and_embeds(
+            prompt_embeds=embed_bytes,
+            config=RenderConfig(),
+        )
+
+        assert len(results) == 1
+        # Should be squeezed to 2D
+        assert results[0]["prompt_embeds"].shape == (10, 768)
+
+    @pytest.mark.asyncio
+    async def test_both_prompts_and_embeds(self, renderer,
+                                           mock_async_tokenizer):
+        # Set up text tokenization
+        mock_async_tokenizer.return_value = MockTokenizerResult(
+            [101, 102, 103])
+        renderer.async_tokenizer_pool[
+            renderer.tokenizer] = mock_async_tokenizer
+
+        # Create embed
+        test_tensor = torch.randn(5, 256, dtype=torch.float32)
+        embed_bytes = self._create_test_embed_bytes(test_tensor)
+
+        results = await renderer.render_prompt_and_embeds(
+            prompt_or_prompts="Hello world",
+            prompt_embeds=embed_bytes,
+            config=RenderConfig(),
+        )
+
+        assert len(results) == 2
+        # First should be embed prompt
+        assert is_embeds_prompt(results[0])
+        # Second should be tokens prompt
+        assert "prompt_token_ids" in results[1]
+        assert results[1]["prompt_token_ids"] == [101, 102, 103]
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index a360069ed0684a2940dc27d90b01b5df86c75b04..26f617e1e15846db358579a3aa60464485bdbf24 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -22,7 +22,7 @@ def clear_cache():
 
 # Define MLA and non-MLA backends separately
 DEVICE_MLA_BACKENDS = {
-    "cuda": ["TRITON_MLA", "FLASHMLA"],
+    "cuda": ["TRITON_MLA", "FLASHMLA", "FLASH_ATTN_MLA", "CUTLASS_MLA"],
     "hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
     "cpu": [],
 }
@@ -97,21 +97,14 @@ def test_env(
             with patch("vllm.attention.selector.current_platform",
                        RocmPlatform()):
                 if use_mla:
-                    # Validate HIP MLA backend-block_size combinations
-                    valid_combination = (
-                        (name == "TRITON_MLA" and block_size != 1)
-                        or (name == "ROCM_AITER_MLA" and block_size == 1))
-
-                    if valid_combination:
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   torch.float16,
-                                                   block_size,
-                                                   False,
-                                                   use_mla=use_mla)
-                        expected = f"{name}_VLLM_V1" if use_v1 else name
-                        assert backend.get_name() == expected
-                    else:
+                    # ROCm MLA backend logic:
+                    # - TRITON_MLA: supported when block_size != 1
+                    # - ROCM_AITER_MLA: supported when block_size == 1
+                    # If backend is forced but doesn't match block_size,
+                    # should raise ValueError
+
+                    if name == "TRITON_MLA" and block_size == 1:
+                        # TRITON_MLA doesn't support block_size == 1
                         with pytest.raises(ValueError) as exc_info:
                             get_attn_backend(16,
                                              torch.float16,
@@ -121,6 +114,27 @@ def test_env(
                                              use_mla=use_mla)
                         assert f"The selected backend, {name}" in str(
                             exc_info.value)
+                    elif name == "ROCM_AITER_MLA" and block_size != 1:
+                        # ROCM_AITER_MLA only supports block_size == 1
+                        with pytest.raises(ValueError) as exc_info:
+                            get_attn_backend(16,
+                                             torch.float16,
+                                             torch.float16,
+                                             block_size,
+                                             False,
+                                             use_mla=use_mla)
+                        assert f"The selected backend, {name}" in str(
+                            exc_info.value)
+                    else:
+                        # Valid backend-block_size combination
+                        backend = get_attn_backend(16,
+                                                   torch.float16,
+                                                   torch.float16,
+                                                   block_size,
+                                                   False,
+                                                   use_mla=use_mla)
+                        expected = f"{name}_VLLM_V1" if use_v1 else name
+                        assert backend.get_name() == expected
                 else:
                     backend = get_attn_backend(16,
                                                torch.float16,
@@ -135,16 +149,57 @@ def test_env(
             with patch("vllm.attention.selector.current_platform",
                        CudaPlatform()):
                 if use_mla:
-                    if name == "FLASHMLA" and block_size == 64:
-                        from vllm.attention.backends.flashmla import (
-                            is_flashmla_supported)
-
-                        # only on cuda platforms with specific capability.
-                        is_supported, _ = is_flashmla_supported()
-
-                        if not is_supported:
-                            # if platform is not supported then skip this case.
-                            pytest.skip()
+                    # CUDA MLA backend logic:
+                    # - CUTLASS_MLA: only supported with block_size == 128
+                    #   and Blackwell GPUs (SM 10.0), V1 only
+                    # - FLASHMLA: only supported with block_size == 64
+                    # - FLASH_ATTN_MLA: V1 only
+                    # - TRITON_MLA: fallback for other cases
+
+                    if name == "CUTLASS_MLA":
+                        if not use_v1:
+                            # CUTLASS_MLA only supported on V1 engine
+                            pytest.skip(
+                                "CUTLASS_MLA only supported on V1 engine")
+                        elif block_size != 128:
+                            # CUTLASS_MLA only supports block_size == 128
+                            pytest.skip(
+                                "CUTLASS_MLA only supports block_size 128")
+                        else:
+                            backend = get_attn_backend(16,
+                                                       torch.float16,
+                                                       torch.float16,
+                                                       block_size,
+                                                       False,
+                                                       use_mla=use_mla)
+                            expected = "CUTLASS_MLA_VLLM_V1"
+                            assert backend.get_name() == expected
+                    elif name == "FLASHMLA":
+                        if block_size != 64:
+                            # FlashMLA only supports block_size == 64
+                            pytest.skip("FlashMLA only supports block_size 64")
+                        else:
+                            from vllm.attention.backends.flashmla import (
+                                is_flashmla_supported)
+                            is_supported, _ = is_flashmla_supported()
+                            if not is_supported:
+                                pytest.skip(
+                                    "FlashMLA not supported on this platform")
+                            else:
+                                backend = get_attn_backend(16,
+                                                           torch.float16,
+                                                           torch.float16,
+                                                           block_size,
+                                                           False,
+                                                           use_mla=use_mla)
+                                expected = f"{name}_VLLM_V1" if use_v1 else name
+                                assert backend.get_name() == expected
+                    elif name == "FLASH_ATTN_MLA":
+                        if not use_v1:
+                            # FlashAttention MLA only supported on V1 engine
+                            pytest.skip(
+                                "FlashAttention MLA only supported on V1 engine"
+                            )
                         else:
                             backend = get_attn_backend(16,
                                                        torch.float16,
@@ -152,9 +207,10 @@ def test_env(
                                                        block_size,
                                                        False,
                                                        use_mla=use_mla)
-                            expected = f"{name}_VLLM_V1" if use_v1 else name
+                            expected = "FLASH_ATTN_MLA"
                             assert backend.get_name() == expected
                     else:
+                        # TRITON_MLA or other fallback
                         backend = get_attn_backend(16,
                                                    torch.float16,
                                                    torch.float16,
diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..5078bd730a1a31cfb05587ae4dfb20c42be15e1f
--- /dev/null
+++ b/tests/kernels/attention/test_cutlass_mla_decode.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+import random
+from typing import Optional
+
+import pytest
+import torch
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+
+
+def cal_diff(x: torch.Tensor,
+             y: torch.Tensor,
+             name: str,
+             use_fp8: bool = False,
+             diff_threshold: Optional[float] = None) -> None:
+    x, y = x.double(), y.double()
+    cos_diff = 1 - 2 * (x * y).sum().item() / max(
+        (x * x + y * y).sum().item(), 1e-12)
+    if diff_threshold is not None:
+        # directly compare the cos_diff with the threshold
+        assert cos_diff < diff_threshold
+    else:
+        # use the default threshold
+        if (use_fp8):
+            assert cos_diff < 1e-4
+        else:
+            assert cos_diff < 1e-5
+
+
+CUTLASS_MLA_UNSUPPORTED_REASON = \
+    "Cutlass MLA Requires compute capability of 10 or above." \
+    if not current_platform.is_device_capability(100) \
+    else "Cutlass MLA is supported"
+
+
+@pytest.mark.skipif(not current_platform.has_device_capability(100),
+                    reason=CUTLASS_MLA_UNSUPPORTED_REASON)
+@pytest.mark.parametrize("b", [128])
+@pytest.mark.parametrize("s_q", [1])
+@pytest.mark.parametrize("mean_sk", [4096, 8192, 16384])
+@pytest.mark.parametrize("h_q", [16, 32, 64, 128])
+@pytest.mark.parametrize("h_kv", [1])
+@pytest.mark.parametrize("d", [576])
+@pytest.mark.parametrize("dv", [512])
+@pytest.mark.parametrize("block_size", [64])
+@pytest.mark.parametrize("causal", [True])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize(
+    "torch_dtype",
+    [
+        torch.bfloat16,
+        # fp8 can have occasional precision-related failures.
+        pytest.param(torch.float8_e4m3fn, marks=pytest.mark.flaky(reruns=2))
+    ])
+@torch.inference_mode()
+def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
+                            causal, varlen, torch_dtype):
+    device = torch.device("cuda:0")
+    if torch_dtype == torch.float8_e4m3fn:
+        init_dtype = torch.bfloat16
+    else:
+        init_dtype = torch_dtype
+    torch.set_default_dtype(init_dtype)
+    torch.set_default_device(device)
+    torch.cuda.set_device(device)
+    torch.manual_seed(42)
+    random.seed(42)
+
+    print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
+          f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}")
+
+    use_fp8 = torch_dtype == torch.float8_e4m3fn
+    scale = math.sqrt(d)**(-1)
+    cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32)
+    if varlen:
+        for i in range(b):
+            cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2),
+                                   s_q)
+    total_seqlens = cache_seqlens.sum().item()
+    max_seqlen = cache_seqlens.max().item()
+    max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
+
+    q = torch.randn(b, s_q, h_q, d)
+    block_table = torch.arange(b * max_seqlen_pad // block_size,
+                               dtype=torch.int32).view(
+                                   b, max_seqlen_pad // block_size)
+    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
+    blocked_v = blocked_k[..., :dv]
+
+    init_dtype = q.dtype
+    if use_fp8:
+        fp8_dtype = torch.float8_e4m3fn
+        descale_q = torch.ones((1), dtype=torch.float32)
+        descale_k = torch.ones((1), dtype=torch.float32)
+
+        q = q.to(fp8_dtype)
+        blocked_k = blocked_k.to(fp8_dtype)
+        blocked_v = blocked_v.to(fp8_dtype)
+    else:
+        descale_q = None
+        descale_k = None
+
+    def cutlass_mla():
+        MAX_HEADS = 128
+
+        q_reshaped = q.squeeze(1)
+        q_nope = q_reshaped[:, :, :dv].clone()
+        q_pe = q_reshaped[:, :, dv:].clone()
+
+        if h_q < MAX_HEADS:
+            q_nope_padded = q_nope.new_empty((b, MAX_HEADS, dv))
+            q_nope_padded[:, :h_q] = q_nope
+            q_nope = q_nope_padded
+
+            q_pe_padded = q_pe.new_empty((b, MAX_HEADS, d - dv))
+            q_pe_padded[:, :h_q] = q_pe
+            q_pe = q_pe_padded
+
+        kv_cache_flat = blocked_k.squeeze(2)
+        device_properties = torch.cuda.get_device_properties(
+            torch.device("cuda:0"))
+        sm_count = device_properties.multi_processor_count
+        workspace_size = ops.sm100_cutlass_mla_get_workspace_size(
+            max_seqlen * block_size, b, sm_count, num_kv_splits=1)
+        workspace = torch.empty(workspace_size,
+                                device="cuda",
+                                dtype=torch.uint8)
+
+        out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype)
+        output_lse = torch.empty((b, MAX_HEADS),
+                                 dtype=torch.float32,
+                                 device=q_nope.device)
+        ops.sm100_cutlass_mla_decode(out_ans, output_lse, q_nope, q_pe,
+                                     kv_cache_flat, cache_seqlens, block_table,
+                                     workspace, scale, 1)
+        return out_ans[:, :h_q].contiguous(), output_lse[:, :h_q].contiguous()
+
+    def scaled_dot_product_attention(query, key, value, is_causal=False):
+        query = query.float()
+        key = key.float()
+        value = value.float()
+        key = key.repeat_interleave(h_q // h_kv, dim=0)
+        value = value.repeat_interleave(h_q // h_kv, dim=0)
+        attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1))
+        if is_causal:
+            s_q = query.shape[-2]
+            s_k = key.shape[-2]
+            attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype)
+            temp_mask = torch.ones(s_q, s_k,
+                                   dtype=torch.bool).tril(diagonal=s_k - s_q)
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            attn_bias.to(query.dtype)
+            attn_weight += attn_bias
+        lse = attn_weight.logsumexp(dim=-1)
+        attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
+        return attn_weight @ value, lse
+
+    def ref_mla():
+        q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q
+        blocked_k_ = (blocked_k.to(torch.float) *
+                      descale_k).to(init_dtype) if use_fp8 else blocked_k
+        blocked_v_ = (blocked_v.to(torch.float) *
+                      descale_k).to(init_dtype) if use_fp8 else blocked_v
+        out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32)
+        lse = torch.empty(b, h_q, s_q, dtype=torch.float32)
+        for i in range(b):
+            begin = i * max_seqlen_pad
+            end = begin + cache_seqlens[i]
+            out_i, lse_i = scaled_dot_product_attention(
+                q_[i].transpose(0, 1),
+                blocked_k_.view(-1, h_kv, d)[begin:end].transpose(0, 1),
+                blocked_v_.view(-1, h_kv, dv)[begin:end].transpose(0, 1),
+                is_causal=causal,
+            )
+            out[i] = out_i.transpose(0, 1)
+            lse[i] = lse_i
+        return out, lse
+
+    out_cutlass, lse_cutlass = cutlass_mla()
+    out_torch, lse_torch = ref_mla()
+    # Extract the single token (s_q=1) slice to match cutlass output shape
+    out_torch_slice = out_torch[:, 0, :, :]  # [b, h_q, dv]
+    lse_torch_slice = lse_torch[:, 0, :]  # [b, h_q]
+    cal_diff(out_cutlass, out_torch_slice, "out", use_fp8)
+    # lse has larger numerical error, so use a larger threshold
+    cal_diff(lse_cutlass, lse_torch_slice, "lse", use_fp8, diff_threshold=1e-3)
+
+    t = triton.testing.do_bench(cutlass_mla)
+    FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
+    bytes = (total_seqlens * h_kv * d +
+             b * s_q * h_q * d) * (torch.finfo(torch_dtype).bits // 8) + (
+                 b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8)
+    print(f"{t:.3f} ms, {FLOPS / 10 ** 9 / t:.0f} TFLOPS,",
+          f"{bytes / 10 ** 6 / t:.0f} GB/s")
diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py
new file mode 100644
index 0000000000000000000000000000000000000000..02225432f77fc63aea37a5bc1abfe62e06b89017
--- /dev/null
+++ b/tests/kernels/attention/test_flashinfer_mla_decode.py
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+import torch.nn.functional as F
+from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
+from torch import Tensor
+
+from vllm.platforms import current_platform
+
+FLASHINFER_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+
+if not current_platform.has_device_capability(100):
+    pytest.skip(
+        reason="FlashInfer MLA Requires compute capability of 10 or above.",
+        allow_module_level=True)
+
+
+def ref_mla(
+        out: Tensor,  # (bs, num_heads, v_head_dim)
+        query: Tensor,  # (bs, num_heads, head_dim)
+        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+        scale: float,
+        block_tables: Tensor,  # (bs, max_num_blocks)
+        seq_lens: Tensor,  # (bs,)
+):
+    bs, num_heads, v_head_dim = out.shape
+    head_dim = query.shape[2]
+
+    for i in range(bs):
+        # gather and flatten KV-cache
+        kv = kv_cache[
+            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1,
+                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
+        v = kv[:, :, :v_head_dim]
+
+        q = query[i].view(num_heads, 1, head_dim)
+        o = F.scaled_dot_product_attention(q,
+                                           kv,
+                                           v,
+                                           scale=scale,
+                                           enable_gqa=True)
+        out[i] = o.view(num_heads, v_head_dim)
+
+    return out
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("bs", [1, 2, 4, 16])
+@pytest.mark.parametrize("block_size", [32, 64])
+def test_flashinfer_mla_decode(dtype: torch.dtype, bs: int, block_size: int):
+    torch.set_default_device('cuda')
+    torch.manual_seed(42)
+
+    # Deepseek R1 config
+    num_heads = 128
+    kv_lora_rank = 512
+    qk_nope_head_dim = 128
+    qk_rope_head_dim = 64
+    qk_head_dim = kv_lora_rank + qk_rope_head_dim
+    scale = (qk_nope_head_dim + qk_rope_head_dim)**-0.5
+
+    MAX_SEQ_LEN = 1024
+
+    seq_lens = [torch.randint(2, MAX_SEQ_LEN, (1, )).item() for _ in range(bs)]
+    seq_lens[-1] = MAX_SEQ_LEN
+    max_seq_len = max(seq_lens)
+    seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
+
+    # Generate block tables with random but unique block IDs
+    # From https://github.com/flashinfer-ai/flashinfer/pull/1222
+    blocks_per_seq = (seq_lens_tensor + block_size - 1) // block_size
+    max_num_blocks_per_seq = max(blocks_per_seq.max().item(), 4)
+    total_blocks_needed = sum(blocks_per_seq)
+    # Get random unique IDs for all blocks
+    all_block_ids = torch.randperm(total_blocks_needed)
+
+    block_id = 0
+    block_tables = torch.zeros(
+        (bs, max_num_blocks_per_seq),
+        dtype=torch.int32,
+    )
+
+    # Populate block tables and track block assignments
+    block_id = 0
+    for i in range(bs):
+        num_blocks_needed = blocks_per_seq[i]
+        block_tables[i, :num_blocks_needed] = all_block_ids[block_id:block_id +
+                                                            num_blocks_needed]
+        block_id += num_blocks_needed
+
+    kv_cache = torch.randn(block_tables.numel(), block_size,
+                           qk_head_dim).to(dtype)
+    q = torch.randn(bs, num_heads, qk_head_dim).to(dtype)
+
+    out_ref = q.new_zeros(bs, num_heads, kv_lora_rank)
+    ref_mla(out_ref, q, kv_cache, scale, block_tables, seq_lens_tensor)
+
+    workspace_buffer = torch.zeros(
+        FLASHINFER_WORKSPACE_BUFFER_SIZE,
+        dtype=torch.uint8,
+        device=q.device,
+    )
+    # Flashinfer MLA expects the query to be of shape
+    # (bs, q_len_per_request, num_heads, qk_head_dim),
+    # where q_len_per_request is the MTP query length (=1 without MTP)
+    q = q.unsqueeze(1)
+
+    out_ans = trtllm_batch_decode_with_kv_cache_mla(
+        query=q,
+        kv_cache=kv_cache.unsqueeze(1),
+        workspace_buffer=workspace_buffer,
+        qk_nope_head_dim=qk_nope_head_dim,
+        kv_lora_rank=kv_lora_rank,
+        qk_rope_head_dim=qk_rope_head_dim,
+        block_tables=block_tables,
+        seq_lens=seq_lens_tensor,
+        max_seq_len=max_seq_len,
+        bmm1_scale=scale,
+    )
+    out_ans = out_ans.squeeze(1)
+    torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 8d0a11d8eb8ab8bb82478e6cf95a606df9efed03..bd3ba554b32e2cbfd75dbc5f489875f5e8ce487c 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -35,6 +35,7 @@ QUANT_DTYPES = [
     # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
     (None, None, None),
     (None, FP8_DTYPE, None),
+    (FP8_DTYPE, FP8_DTYPE, None),
     (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
     (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
 ]
@@ -44,6 +45,7 @@ NUM_HEADS = [(64, 8), (40, 8)]
 HEAD_SIZE = [128]
 KV_LAYOUT = ["HND"]  # currently only HND is supported
 BLOCK_SIZE = [16]
+WINDOW_LEFT = [-1, 127]
 SOFT_CAP = [None, 50.0]
 
 NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
@@ -57,6 +59,7 @@ NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 @pytest.mark.parametrize("head_size", HEAD_SIZE)
 @pytest.mark.parametrize("kv_layout", KV_LAYOUT)
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", SOFT_CAP)
 @torch.inference_mode
 def test_flashinfer_trtllm_decode_with_baseline(
@@ -69,6 +72,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
     head_size: int,
     kv_layout: str,
     block_size: int,
+    window_left: int,
     soft_cap: Optional[float],
 ) -> None:
     torch.set_default_device("cuda")
@@ -155,6 +159,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
                  sm_scale=sm_scale,
                  q_data_type=dtype,
                  kv_data_type=dtype,
+                 window_left=window_left,
                  logits_soft_cap=soft_cap)
 
     output = torch.empty(ref_query.shape, dtype=dtype)
@@ -188,6 +193,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
         max_seq_len=max_seq_len,
         bmm1_scale=q_scale * k_scale * sm_scale,
         bmm2_scale=v_scale / o_scale,
+        window_left=window_left,
         o_sf_scale=o_sf_scale,
         out=output_trtllm,
     )
@@ -222,6 +228,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
 @pytest.mark.parametrize("head_size", HEAD_SIZE)
 @pytest.mark.parametrize("kv_layout", KV_LAYOUT)
 @pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("window_left", WINDOW_LEFT)
 @pytest.mark.parametrize("soft_cap", [None])
 @torch.inference_mode
 def test_flashinfer_trtllm_prefill_with_baseline(
@@ -234,6 +241,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     head_size: int,
     kv_layout: str,
     block_size: int,
+    window_left: int,
     soft_cap: Optional[float],
 ) -> None:
     torch.set_default_device("cuda")
@@ -334,6 +342,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
                  sm_scale=sm_scale,
                  q_data_type=dtype,
                  kv_data_type=dtype,
+                 window_left=window_left,
                  logits_soft_cap=soft_cap)
 
     output = torch.empty(ref_query.shape, dtype=dtype)
@@ -371,6 +380,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         batch_size=batch_size,
         cum_seq_lens_q=q_indptr,
         cum_seq_lens_kv=kv_indptr,
+        window_left=window_left,
         o_sf_scale=o_sf_scale,
         out=output_trtllm,
     )
@@ -390,6 +400,8 @@ def test_flashinfer_trtllm_prefill_with_baseline(
         rtol, atol = 4e-1, 1e0
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
         rtol, atol = 5e-2, 7e-2
+    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2
 
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 53c37554b15a344e26bdf230d5924075251cf4f1..c01ea32994da0f31f12013ee1e0a4b7dd2cd97e7 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -23,6 +23,9 @@ def clear_cache():
     """Clear lru cache to ensure each test case runs without caching.
     """
     _cached_get_attn_backend.cache_clear()
+    # Clear xformers availability cache
+    import vllm.attention.layer as layer_module
+    layer_module.USE_XFORMERS_OPS = None
 
 
 @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
@@ -33,19 +36,28 @@ def test_mha_attn_platform(device: str):
     torch.set_default_dtype(torch.float16)
 
     if device == "cpu":
-        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+        with patch("vllm.attention.selector.current_platform",
+                   CpuPlatform()), \
+             patch("vllm.platforms.current_platform", CpuPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.TORCH_SDPA
+            assert attn.attn_backend == _Backend.TORCH_SDPA_VLLM_V1
     elif device == "hip":
-        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        with patch("vllm.attention.selector.current_platform",
+                   RocmPlatform()), \
+             patch("vllm.platforms.current_platform", RocmPlatform()), \
+             patch("vllm.attention.layer.current_platform", RocmPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
             assert attn.attn_backend == _Backend.TORCH_SDPA
     else:
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        with patch("vllm.attention.selector.current_platform",
+                   CudaPlatform()), \
+             patch("vllm.platforms.current_platform", CudaPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
             assert attn.attn_backend == _Backend.XFORMERS
 
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        with patch("vllm.attention.selector.current_platform",
+                   CudaPlatform()), \
+             patch("vllm.platforms.current_platform", CudaPlatform()):
             attn = MultiHeadAttention(16, 72, scale=1)
             assert attn.attn_backend == _Backend.XFORMERS
 
diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py
index 02316ceaac7359ea22d8a4dae91482f78b030cef..53e6d793cf2f9f28d96a21fc9224dd0163e01d66 100644
--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -6,7 +6,7 @@ import torch
 
 from tests.kernels.quant_utils import FP8_DTYPE
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
 from vllm.platforms import current_platform
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -70,6 +70,37 @@ def test_rms_norm(
                 (out, x, layer.weight.data, layer.variance_epsilon))
 
 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_poly_norm(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    layer = PolyNorm().to(dtype=dtype)
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    layer.bias.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x *= scale
+
+    ref_out = layer.forward_native(x)
+    out = layer(x)
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    opcheck(
+        torch.ops._C.poly_norm,
+        (out, x, layer.weight.data, layer.bias.data, layer.variance_epsilon))
+
+
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py
index 2c554baaff76c413932508bfbf625c8b0fc73cdf..fc60d5ac82b27d4a4478c27b877858114e9e96fa 100644
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@@ -115,21 +115,27 @@ def generate_continuous_batched_examples(example_lens_by_batch,
                                          n_heads,
                                          d_head,
                                          itype,
-                                         device='cuda'):
+                                         device='cuda',
+                                         return_naive_ref=True):
 
     # this function generates a random examples of certain length
     # and then cut according to "example_lens_by_batch" and feed
-    # them in continuous batches to the kernels
+    # them in continuous batches to the kernels.
+    # If if return_naive_ref=True, the naive torch implementation
+    # ssd_minimal_discrete will be used to compute and return
+    # reference output.
 
     # generate the full-length example
     A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads,
                                             d_head, itype)
 
-    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
-                                                  A * dt,
-                                                  B,
-                                                  C,
-                                                  block_len=full_length // 4)
+    if return_naive_ref:
+        Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
+                                                      A * dt,
+                                                      B,
+                                                      C,
+                                                      block_len=full_length //
+                                                      4)
 
     # internal function that outputs a cont batch of examples
     # given a tuple of lengths for each example in the batch
@@ -179,7 +185,8 @@ def generate_continuous_batched_examples(example_lens_by_batch,
             IND_S = [x % full_length for x in IND_E]
         IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]
 
-        yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)],
+        yield ([Y_min[s, IND_S[s]:IND_E[s]]
+                for s in range(num_examples)] if return_naive_ref else None,
                cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
 
 
@@ -324,3 +331,213 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
             if clear:
                 states[i].fill_(0.)
                 exhausted[i] = False
+
+
+@pytest.mark.parametrize("chunk_size", [8, 256])
+@pytest.mark.parametrize("seqlens", [
+    (16, 2, 8, 13),
+    (270, 88, 212, 203),
+    (16, 20),
+])
+def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
+
+    # This test verifies the correctness of the chunked prefill implementation
+    # in the mamba2 ssd kernels, by comparing concatenation (in the sequence
+    # dimension) of chunked results with the full sequence result.
+    # It is different from test_mamba_chunk_scan_cont_batch by:
+    # 1. Not using the naive torch implementation (ssd_minimal_discrete) to get
+    #    reference outputs. Instead, it compares chunked kernel outputs to full
+    #    sequence kernel outputs. This is the most straightforward way to
+    #    assert chunked prefill correctness.
+    # 2. It focuses on cases where sequences change in the middle of mamba
+    #    chunks, and not necessarily on chunk boundaries.
+
+    max_seqlen = max(seqlens)
+    # This test can have larger error for longer sequences
+    if max_seqlen > 256:
+        atol, rtol = 1e-2, 5e-3
+    else:
+        atol, rtol = 5e-3, 5e-3
+
+    num_sequences = len(seqlens)
+    n_heads = 16
+    d_head = 64
+    itype = torch.float32
+
+    # hold state during the cutting process so we know if an
+    # example has been exhausted and needs to cycle
+    last_taken: dict = {}  # map: eg -> pointer to last taken sample
+    exhausted: dict = {}  # map: eg -> boolean indicating example is exhausted
+    _, cu_seqlens, seq_idx, (A, dt, X, B, C) = next(
+        generate_continuous_batched_examples([seqlens],
+                                             num_sequences,
+                                             max_seqlen,
+                                             last_taken,
+                                             exhausted,
+                                             n_heads,
+                                             d_head,
+                                             itype,
+                                             return_naive_ref=False))
+    seqlens = torch.tensor(seqlens, dtype=torch.int32, device=X.device)
+    device = X.device
+
+    ## full seqlen computation
+    chunk_indices, chunk_offsets = \
+            _query_start_loc_to_chunk_indices_offsets(
+                cu_seqlens, chunk_size, cu_seqlens[-1])
+    Y_ref = torch.empty_like(X)
+    state_ref = mamba_chunk_scan_combined(
+        X,
+        dt,
+        A,
+        B,
+        C,
+        chunk_size,
+        D=None,
+        cu_seqlens=cu_seqlens,
+        seq_idx=seq_idx,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        return_varlen_states=True,
+        initial_states=None,
+        out=Y_ref,
+    )
+
+    ## chunked seqlen computation
+    # first chunk
+    chunked_seqlens = seqlens // 2
+    chunked_cu_seqlens = torch.cat([
+        torch.tensor([0], device=device),
+        torch.cumsum(chunked_seqlens, dim=0)
+    ],
+                                   dim=0)
+    chunked_seq_idx = torch.repeat_interleave(
+        torch.arange(len(chunked_seqlens), device=device),
+        chunked_seqlens,
+        output_size=chunked_cu_seqlens[-1]).unsqueeze(0).to(torch.int32)
+    chunked_input_seq_len = chunked_cu_seqlens[-1]
+    X_chunked = torch.zeros_like(X)[:, :chunked_input_seq_len, ...]
+    dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...]
+    B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...]
+    C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...]
+    for i in range(num_sequences):
+        # fmt: off
+        chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...]  # noqa: E501
+
+        X_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i)  # noqa: E501
+        dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i)  # noqa: E501
+        B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i)  # noqa: E501
+        C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i)  # noqa: E501
+        # fmt: on
+
+    chunk_indices, chunk_offsets = \
+            _query_start_loc_to_chunk_indices_offsets(
+                chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
+    Y_partial = torch.empty_like(X_chunked)
+    partial_state = mamba_chunk_scan_combined(
+        X_chunked,
+        dt_chunked,
+        A,
+        B_chunked,
+        C_chunked,
+        chunk_size,
+        D=None,
+        cu_seqlens=chunked_cu_seqlens,
+        seq_idx=chunked_seq_idx,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        return_varlen_states=True,
+        initial_states=None,
+        out=Y_partial,
+    )
+
+    # remaining chunk
+    remaining_chunked_seqlens = seqlens - chunked_seqlens
+    remaining_chunked_cu_seqlens = torch.cat([
+        torch.tensor([0], device=device),
+        torch.cumsum(remaining_chunked_seqlens, dim=0)
+    ],
+                                             dim=0)
+    remaining_chunked_seq_idx = torch.repeat_interleave(
+        torch.arange(len(remaining_chunked_seqlens), device=device),
+        remaining_chunked_seqlens,
+        output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0).to(
+            torch.int32)
+    remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
+    # fmt: off
+    remaining_X_chunked = torch.zeros_like(X)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
+    remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...]  # noqa: E501
+    for i in range(num_sequences):
+        remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...]  # noqa: E501
+
+        remaining_X_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i)  # noqa: E501
+        remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i)  # noqa: E501
+        remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i)  # noqa: E501
+        remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i)  # noqa: E501
+
+    # assert input chunking is correct
+    concat_chunk_f = lambda pt1, pt2, i: torch.cat([
+        pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
+        pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
+        ],
+        dim=1)
+    concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1)  # noqa: E501
+    # fmt: on
+
+    assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X)
+    assert concat_batch_f(dt_chunked, remaining_dt_chunked).equal(dt)
+    assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B)
+    assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C)
+
+    chunk_indices, chunk_offsets = \
+            _query_start_loc_to_chunk_indices_offsets(
+                remaining_chunked_cu_seqlens,
+                chunk_size,
+                remaining_chunked_cu_seqlens[-1])
+
+    Y_chunked = torch.empty_like(remaining_X_chunked)
+    state_chunked = mamba_chunk_scan_combined(
+        remaining_X_chunked,
+        remaining_dt_chunked,
+        A,
+        remaining_B_chunked,
+        remaining_C_chunked,
+        chunk_size,
+        D=None,
+        cu_seqlens=remaining_chunked_cu_seqlens,
+        seq_idx=remaining_chunked_seq_idx,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        return_varlen_states=True,
+        initial_states=partial_state,
+        out=Y_chunked,
+    )
+    Y = concat_batch_f(Y_partial, Y_chunked)
+
+    # kernel chunked is same as kernel overall
+    for i in range(num_sequences):
+        Y_seq = Y[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        Y_ref_seq = Y_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
+        torch.testing.assert_close(
+            Y_seq[:, :chunked_seqlens[i], ...],
+            Y_ref_seq[:, :chunked_seqlens[i], ...],
+            atol=atol,
+            rtol=rtol,
+            msg=lambda x: f"seq{i} output part1 " + x)  # noqa: B023
+        torch.testing.assert_close(
+            Y_seq[:, chunked_seqlens[i]:, ...],
+            Y_ref_seq[:, chunked_seqlens[i]:, ...],
+            atol=atol,
+            rtol=rtol,
+            msg=lambda x: f"seq{i} output part2 " + x)  # noqa: B023
+
+        state_seq = state_chunked[i]
+        state_seq_ref = state_ref[i]
+        torch.testing.assert_close(
+            state_seq,
+            state_seq_ref,
+            atol=atol,
+            rtol=rtol,
+            msg=lambda x: f"seq{i} state " + x)  # noqa: B023
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index 36a98522a658805512323d9175e625b5a7b5ca8e..6558cab6a9efffc40fd1c18486d106cabec6dc61 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -282,7 +282,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
         a1_scale=a1_scale,
         block_shape=block_shape,
         # Make sure this is set to False so we
-        # dont end up comparing the same implementation.
+        # don't end up comparing the same implementation.
         allow_deep_gemm=False)
 
 
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 0ea9667914fd530854ad00f8f4fff284e75f1539..850c486b95240af977082d5ea13318f6baed1341 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -371,8 +371,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
 @torch.inference_mode()
-def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
-                     monkeypatch):
+def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
+                     use_rocm_aiter: bool, monkeypatch):
     """Make sure our Mixtral MoE implementation agrees with the one from
     huggingface."""
 
diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py
index 7bd1ffce58e9624b88ab4e8c98c8a99abb277ff7..9fd72ee152b55441a62ec1c81958a7b020c72dbd 100644
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_mxfp4_moe.py
@@ -11,6 +11,7 @@ import torch
 from packaging import version
 
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
 
 QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
     "quark") is not None and version.parse(
@@ -19,11 +20,17 @@ QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
 TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
 ) and current_platform.is_device_capability(100)
 
+HOPPER_MXFP4_BF16_AVAILABLE = (current_platform.is_cuda()
+                               and current_platform.is_device_capability(90)
+                               and has_flashinfer())
+
 if TRTLLM_GEN_MXFP4_AVAILABLE:
     from flashinfer import (fp4_quantize, mxfp8_quantize,
                             next_positive_power_of_2,
                             reorder_rows_for_gated_act_gemm, shuffle_matrix_a,
                             shuffle_matrix_sf_a, trtllm_fp4_block_scale_moe)
+    from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
+    from flashinfer.fused_moe.core import _maybe_get_cached_w2_permute_indices
 
 
 @dataclass
@@ -204,6 +211,7 @@ def tg_mxfp4_moe(
     alpha,
     beta,
     limit,
+    transpose_optimized: bool = False,
 ) -> torch.Tensor:
     sf_block_size = 32
     assert (w13_weight.dim() == 3 and w13_weight.shape[0] == num_experts
@@ -224,7 +232,7 @@ def tg_mxfp4_moe(
     assert (w2_bias.dim() == 2 and w2_bias.shape[0] == num_experts
             and w2_bias.shape[1] == hidden_size)
 
-    # Swap w1 and w3 as the defenition of
+    # Swap w1 and w3 as the definition of
     # swiglu is different in the trtllm-gen
     w13_weight_scale_ = w13_weight_scale.clone()
     w13_weight_ = w13_weight.clone()
@@ -267,22 +275,85 @@ def tg_mxfp4_moe(
     gemm1_bias_shuffled = []
     gemm2_bias_shuffled = []
     epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
-    for i in range(num_experts):
-        gemm1_weights_shuffled.append(
-            shuffle_matrix_a(w13_weight[i].view(torch.uint8), epilogue_tile_m))
-        gemm1_scales_shuffled.append(
-            shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8),
-                                epilogue_tile_m))
-
-        gemm2_weights_shuffled.append(
-            shuffle_matrix_a(w2_weight[i].view(torch.uint8), epilogue_tile_m))
-        gemm2_scales_shuffled.append(
-            shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8),
-                                epilogue_tile_m))
-        gemm1_bias_shuffled.append(
-            shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m))
-        gemm2_bias_shuffled.append(
-            shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m))
+    _cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
+    if transpose_optimized:
+        for i in range(num_experts):
+            # w13 weight shuffling
+            permute_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w13_weight[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm1_weights_shuffled.append(w13_weight[i].view(
+                torch.uint8)[permute_indices.to(
+                    w13_weight.device)].contiguous())
+            # w13 scale shuffling
+            permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w13_weight_scale[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm1_scales_shuffled.append(
+                nvfp4_block_scale_interleave(w13_weight_scale[i].view(
+                    torch.uint8)[permute_sf_indices.to(
+                        w13_weight_scale.device)].contiguous()))
+            # w13 bias shuffling
+            permute_bias_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w13_bias[i].clone().reshape(-1, 1),
+                epilogue_tile_m,
+            )
+            gemm1_bias_shuffled.append(w13_bias[i].clone().reshape(
+                -1, 1)[permute_bias_indices.to(w13_bias.device)].contiguous())
+            # w2 weight shuffling
+            permute_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w2_weight[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm2_weights_shuffled.append(w2_weight[i].view(
+                torch.uint8)[permute_indices.to(
+                    w2_weight.device)].contiguous())
+            # w2 scale shuffling
+            permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w2_weight_scale[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm2_scales_shuffled.append(
+                nvfp4_block_scale_interleave(w2_weight_scale[i].view(
+                    torch.uint8)[permute_sf_indices.to(
+                        w2_weight_scale.device)].contiguous()))
+            # w2 bias shuffling
+            permute_indices = _maybe_get_cached_w2_permute_indices(
+                _cache_permute_indices,
+                w2_bias[i].clone().reshape(-1, 1),
+                epilogue_tile_m,
+            )
+            gemm2_bias_shuffled.append(w2_bias[i].clone().reshape(
+                -1, 1)[permute_indices.to(w2_bias.device)].contiguous())
+
+    else:
+        for i in range(num_experts):
+            gemm1_weights_shuffled.append(
+                shuffle_matrix_a(w13_weight[i].view(torch.uint8),
+                                 epilogue_tile_m))
+            gemm1_scales_shuffled.append(
+                shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8),
+                                    epilogue_tile_m))
+
+            gemm2_weights_shuffled.append(
+                shuffle_matrix_a(w2_weight[i].view(torch.uint8),
+                                 epilogue_tile_m))
+            gemm2_scales_shuffled.append(
+                shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8),
+                                    epilogue_tile_m))
+            gemm1_bias_shuffled.append(
+                shuffle_matrix_a(w13_bias[i].reshape(-1, 1), epilogue_tile_m))
+            gemm2_bias_shuffled.append(
+                shuffle_matrix_a(w2_bias[i].reshape(-1, 1), epilogue_tile_m))
 
     w13_weight = torch.stack(gemm1_weights_shuffled)
     w13_weight_scale = torch.stack(gemm1_scales_shuffled).reshape(
@@ -356,6 +427,7 @@ def check_accuracy(a, b, atol, rtol, percent):
 @pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
                                               (1.702, 1.0, 7.0)])
 @pytest.mark.parametrize("act_type", ['mxfp8', 'bf16'])
+@pytest.mark.parametrize("transpose_optimized", [False, True])
 @pytest.mark.skipif(
     not TRTLLM_GEN_MXFP4_AVAILABLE,
     reason="nvidia gpu and compute capability sm100 is required for this test")
@@ -369,6 +441,7 @@ def test_trtllm_gen_mxfp4_fused_moe(
     beta: float,
     limit: Optional[float],
     act_type: str,
+    transpose_optimized: bool,
 ):
     seed = 42
     torch.manual_seed(seed)
@@ -470,6 +543,321 @@ def test_trtllm_gen_mxfp4_fused_moe(
                              act_type,
                              alpha=alpha,
                              beta=beta,
-                             limit=limit)
+                             limit=limit,
+                             transpose_optimized=transpose_optimized)
     # relatively loose check since the mxfp4 quantization is less accurate
     check_accuracy(ref_result, tg_result, atol=0, rtol=0.3, percent=0.8)
+
+
+def _interleave_scales_lastdim_by4(scales: torch.Tensor) -> torch.Tensor:
+    """Interleave scales on the last dimension by groups of 4, matching
+    the transformation in mxfp4.py's BF16 (Hopper) path."""
+    s = scales.to(torch.uint8)
+    s_shape = s.shape
+    assert s_shape[-1] % 4 == 0
+    s = s.reshape(*s_shape[:-1], s_shape[-1] // 4, 4)
+    # Move the 4-group dimension before the row dimension
+    permuted = s.permute(0, 2, 1, 3)
+    # Merge the row dim with the 4-group dim
+    return permuted.reshape(s_shape[0], s_shape[-1] // 4, s_shape[1] * 4)
+
+
+@pytest.mark.parametrize("topk", [1, 4])
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("num_tokens", [1, 128])
+@pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)])
+@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
+                                              (1.702, 1.0, 7.0)])
+@pytest.mark.skipif(
+    not HOPPER_MXFP4_BF16_AVAILABLE,
+    reason="nvidia gpu sm90 and flashinfer are required for this test",
+)
+def test_flashinfer_cutlass_mxfp4_fused_moe(
+    topk: int,
+    num_experts: int,
+    num_tokens: int,
+    intermediate_size: int,
+    hidden_size: int,
+    alpha: float,
+    beta: float,
+    limit: Optional[float],
+):
+    torch.manual_seed(42)
+    device = "cuda:0"
+
+    # Inputs
+    hidden_states = torch.randn(num_tokens,
+                                hidden_size,
+                                device=device,
+                                dtype=torch.bfloat16)
+    # Random MXFP4 weights and scales (uint8), contiguous [w1; w3]
+    w13_q = torch.randint(
+        0,
+        256, (num_experts, 2 * intermediate_size, hidden_size // 2),
+        device=device,
+        dtype=torch.uint8)
+    w13_scale = torch.randint(
+        118,
+        123, (num_experts, 2 * intermediate_size, hidden_size // 32),
+        device=device,
+        dtype=torch.uint8)
+
+    w2_q = torch.randint(0,
+                         256,
+                         (num_experts, hidden_size, intermediate_size // 2),
+                         device=device,
+                         dtype=torch.uint8)
+    w2_scale = torch.randint(
+        118,
+        123, (num_experts, hidden_size, intermediate_size // 32),
+        device=device,
+        dtype=torch.uint8)
+    # Bias contiguous [b1; b3]
+    bias13 = (torch.randn(num_experts,
+                          2 * intermediate_size,
+                          device=device,
+                          dtype=torch.bfloat16) * 10)
+    bias2 = (torch.randn(
+        num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10)
+    router_logits = torch.rand(num_tokens,
+                               num_experts,
+                               dtype=torch.float32,
+                               device=device)
+
+    w13_ref = mxfp4_dequantize(w13_q.clone(), w13_scale.clone()).reshape(
+        num_experts, 2 * intermediate_size, hidden_size)
+    w2_ref = mxfp4_dequantize(w2_q.clone(), w2_scale.clone()).reshape(
+        num_experts, hidden_size, intermediate_size)
+    ref = reference_moe(router_logits.to(torch.float32), topk, num_experts,
+                        hidden_states.to(torch.float32), w13_ref,
+                        bias13.to(torch.float32), w2_ref,
+                        bias2.to(torch.float32), alpha, beta, limit, 'bf16')
+
+    from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
+
+    # Swap halves to arrange as [w3; w1] (kernel expectation)
+    w1_w, w3_w = torch.chunk(w13_q, 2, dim=1)
+    w13_q_swapped = torch.cat([w3_w, w1_w], dim=1)
+
+    b1, b3 = torch.chunk(bias13.to(torch.float32), 2, dim=-1)
+    w13_b = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
+
+    w1_s, w3_s = torch.chunk(w13_scale, 2, dim=1)
+    w13_s = torch.cat([w3_s, w1_s], dim=1)
+    w13_s_inter = _interleave_scales_lastdim_by4(w13_s)
+    w2_s_inter = _interleave_scales_lastdim_by4(w2_scale)
+
+    routing_weights = torch.nn.functional.softmax(router_logits,
+                                                  dim=1,
+                                                  dtype=torch.float32)
+    token_final_scales, token_selected_experts = torch.topk(routing_weights,
+                                                            topk,
+                                                            dim=-1)
+    token_final_scales = (token_final_scales /
+                          token_final_scales.sum(dim=-1, keepdim=True))
+    token_selected_experts = token_selected_experts.to(torch.int).contiguous()
+
+    out = torch.empty_like(hidden_states, dtype=torch.bfloat16)
+    if alpha is not None:
+        alpha = torch.full((num_experts, ), alpha, device=hidden_states.device)
+    if beta is not None:
+        beta = torch.full((num_experts, ), beta, device=hidden_states.device)
+    if limit is not None:
+        limit = torch.full((num_experts, ), limit, device=hidden_states.device)
+
+    _ = flashinfer_cutlass_fused_moe(
+        input=hidden_states,
+        token_selected_experts=token_selected_experts,
+        token_final_scales=token_final_scales,
+        fc1_expert_weights=w13_q_swapped,
+        fc2_expert_weights=w2_q,
+        output_dtype=torch.bfloat16,
+        output=out,
+        quant_scales=[w13_s_inter.to(torch.uint8),
+                      w2_s_inter.to(torch.uint8)],
+        fc1_expert_biases=w13_b,
+        fc2_expert_biases=bias2.to(torch.bfloat16),
+        swiglu_alpha=alpha,
+        swiglu_beta=beta,
+        swiglu_limit=limit,
+        tp_size=1,
+        tp_rank=0,
+        ep_size=1,
+        ep_rank=0,
+        use_w4_group_scaling=True,
+    )
+
+    # Allow some mismatch due to MXFP4 quantization
+    check_accuracy(ref, out, atol=0, rtol=0.3, percent=0.8)
+
+
+@pytest.mark.parametrize("topk", [1, 4])
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("num_tokens", [1, 128])
+@pytest.mark.parametrize("intermediate_size,hidden_size", [(3072, 3072)])
+@pytest.mark.parametrize("alpha,beta,limit", [(1.0, 1.0, None),
+                                              (1.702, 1.0, 7.0)])
+@pytest.mark.skipif(
+    not (current_platform.is_cuda()
+         and current_platform.is_device_capability(100) and has_flashinfer()),
+    reason="NVIDIA GPU sm100 and flashinfer are required for this test",
+)
+def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
+    topk: int,
+    num_experts: int,
+    num_tokens: int,
+    intermediate_size: int,
+    hidden_size: int,
+    alpha: Optional[float],
+    beta: Optional[float],
+    limit: Optional[float],
+):
+    torch.manual_seed(42)
+    device = "cuda:0"
+
+    # Inputs
+    hidden_states = torch.randn(num_tokens,
+                                hidden_size,
+                                device=device,
+                                dtype=torch.bfloat16)
+    # Float weights in w13 format [w1; w3]
+    w13 = (torch.randn(num_experts,
+                       2 * intermediate_size,
+                       hidden_size,
+                       device=device,
+                       dtype=torch.bfloat16) / 10)
+    w2 = (torch.randn(num_experts,
+                      hidden_size,
+                      intermediate_size,
+                      device=device,
+                      dtype=torch.bfloat16) / 10)
+    # Bias contiguous [b1; b3]
+    bias13 = (torch.randn(num_experts,
+                          2 * intermediate_size,
+                          device=device,
+                          dtype=torch.bfloat16) * 10)
+    bias2 = (torch.randn(
+        num_experts, hidden_size, device=device, dtype=torch.bfloat16) * 10)
+    router_logits = torch.rand(num_tokens,
+                               num_experts,
+                               dtype=torch.float32,
+                               device=device)
+
+    # Quantize weights to MXFP4 per expert (SM100 path)
+    from flashinfer import mxfp4_quantize
+
+    def quant_mxfp4_batches(a: torch.Tensor, e: int):
+        qs, sfs = [], []
+        for i in range(e):
+            q, sf = mxfp4_quantize(a[i].cuda())
+            qs.append(q)
+            sfs.append(sf)
+        return torch.stack(qs), torch.stack(sfs)
+
+    def dequant_mxfp4_batches(mat_fp4: torch.Tensor,
+                              scale_tensor: torch.Tensor):
+        num_batches = mat_fp4.size(0)
+        scale_tensor = scale_tensor.view(num_batches, -1)
+        from flashinfer import mxfp4_dequantize
+        return torch.stack([
+            mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
+            for b in range(num_batches)
+        ])
+
+    w13_q, w13_scale = quant_mxfp4_batches(w13, num_experts)
+    w2_q, w2_scale = quant_mxfp4_batches(w2, num_experts)
+
+    # Reference result using dequantized tensors and reference_moe
+    w13_ref = dequant_mxfp4_batches(
+        w13_q.view(torch.uint8),
+        w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
+            num_experts, 2 * intermediate_size, hidden_size)
+    w2_ref = dequant_mxfp4_batches(
+        w2_q.view(torch.uint8),
+        w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
+            num_experts, hidden_size, intermediate_size)
+
+    # Quantize activations for SM100 path and dequantize for reference
+    hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32)
+    # Reference uses BF16 input but quantizes intermediate activation to MXFP8
+    ref = reference_moe(router_logits.to(torch.float32), topk, num_experts,
+                        hidden_states.to(torch.float32), w13_ref,
+                        bias13.to(torch.float32), w2_ref,
+                        bias2.to(torch.float32), alpha, beta, limit, 'mxfp8')
+
+    # Prepare inputs for FlashInfer CUTLASS fused MoE
+    from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
+
+    # Swap halves to arrange as [w3; w1] (kernel expectation)
+    w1_w, w3_w = torch.chunk(w13_q, 2, dim=1)
+    w13_q_swapped = torch.cat([w3_w, w1_w], dim=1)
+
+    # Swap scales halves to match swapped weights
+    s1, s3 = torch.chunk(w13_scale, 2, dim=1)
+    w13_scale_swapped = torch.cat([s3, s1], dim=1)
+
+    b1, b3 = torch.chunk(bias13.to(torch.float32), 2, dim=-1)
+    w13_b = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
+
+    # Build routing for kernel
+    routing_weights = torch.nn.functional.softmax(router_logits,
+                                                  dim=1,
+                                                  dtype=torch.float32)
+    token_final_scales, token_selected_experts = torch.topk(routing_weights,
+                                                            topk,
+                                                            dim=-1)
+    token_final_scales = (token_final_scales /
+                          token_final_scales.sum(dim=-1, keepdim=True))
+    token_selected_experts = token_selected_experts.to(torch.int).contiguous()
+
+    out = torch.empty_like(hidden_states, dtype=torch.bfloat16)
+    if alpha is not None:
+        alpha_t = torch.full((num_experts, ),
+                             alpha,
+                             device=hidden_states.device)
+    else:
+        alpha_t = None
+    if beta is not None:
+        beta_t = torch.full((num_experts, ), beta, device=hidden_states.device)
+    else:
+        beta_t = None
+    if limit is not None:
+        limit_t = torch.full((num_experts, ),
+                             limit,
+                             device=hidden_states.device)
+    else:
+        limit_t = None
+
+    # Quant scales for SM100 MXFP8+MXFP4 path
+    fake_input_scale = torch.ones(num_experts, device=device)
+    quant_scales = [
+        w13_scale_swapped.view(torch.int32),
+        fake_input_scale,
+        w2_scale.view(torch.int32),
+        fake_input_scale,
+    ]
+
+    _ = flashinfer_cutlass_fused_moe(
+        input=hidden_states_q,
+        token_selected_experts=token_selected_experts,
+        token_final_scales=token_final_scales,
+        fc1_expert_weights=w13_q_swapped.contiguous().view(torch.long),
+        fc2_expert_weights=w2_q.contiguous().view(torch.long),
+        output_dtype=torch.bfloat16,
+        output=out,
+        quant_scales=quant_scales,
+        fc1_expert_biases=w13_b,
+        fc2_expert_biases=bias2.to(torch.bfloat16),
+        swiglu_alpha=alpha_t,
+        swiglu_beta=beta_t,
+        swiglu_limit=limit_t,
+        tp_size=1,
+        tp_rank=0,
+        ep_size=1,
+        ep_rank=0,
+        use_mxfp8_act_scaling=True,
+        input_sf=hidden_states_sf,
+    )
+
+    # Allow some mismatch due to MXFP4 quantization
+    check_accuracy(ref, out, atol=0, rtol=0.3, percent=0.8)
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 3f36d7ada2e94de7e778cc309e2ba643e27dfd2c..394f5211408590e5f87cdff54f38a05afeb4f4fc 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -4,10 +4,11 @@
 
 Run `pytest tests/kernels/test_pplx_moe.py`.
 """
+import copy
 import itertools
 import textwrap
 import traceback
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import pytest
 import torch
@@ -21,7 +22,10 @@ try:
 except ImportError:
     has_pplx = False
 
-from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
+from tests.kernels.moe.modular_kernel_tools.parallel_utils import (
+    _set_vllm_config)
+from tests.kernels.moe.utils import (make_shared_experts, make_test_weights,
+                                     naive_batched_moe)
 from tests.kernels.quant_utils import dequant
 from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
@@ -511,7 +515,8 @@ def pplx_moe(
     block_shape: Optional[list[int]] = None,
     use_compile: bool = False,
     use_cudagraphs: bool = True,
-) -> torch.Tensor:
+    shared_experts: Optional[torch.nn.Module] = None,
+) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
 
     num_tokens, hidden_dim = a.shape
     num_experts = w1.shape[0]
@@ -546,6 +551,7 @@ def pplx_moe(
     fused_experts = FusedMoEModularKernel(
         prepare_finalize,
         experts,
+        shared_experts,
     )
 
     # Note: workers with the same dp_rank must use the exact same inputs.
@@ -586,7 +592,11 @@ def pplx_moe(
                          global_num_experts=num_experts)
 
     if use_cudagraphs:
-        out.fill_(0)
+        if isinstance(out, tuple):
+            out[0].fill_(0)
+            out[1].fill_(0)
+        else:
+            out.fill_(0)
         stream = torch.cuda.Stream()
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph, stream=stream):
@@ -626,6 +636,7 @@ def _pplx_moe(
     per_act_token_quant: bool = False,
     block_shape: Optional[list[int]] = None,
     use_internode: bool = False,
+    shared_experts: Optional[torch.nn.Module] = None,
 ):
     try:
         if use_internode:
@@ -666,6 +677,11 @@ def _pplx_moe(
         with set_current_vllm_config(vllm_config), override_config(moe_config):
             topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
 
+            if shared_experts is not None:
+                shared_output = shared_experts(a)
+            else:
+                shared_output = None
+
             torch_output = torch_experts(
                 a,
                 w1,
@@ -696,7 +712,7 @@ def _pplx_moe(
                 block_shape=block_shape,
             )
 
-            pplx_output = pplx_moe(
+            pplx_outputs = pplx_moe(
                 group_name,
                 rank,
                 world_size,
@@ -713,8 +729,24 @@ def _pplx_moe(
                 quant_dtype=quant_dtype,
                 per_act_token_quant=per_act_token_quant,
                 block_shape=block_shape,
+                shared_experts=shared_experts,
             )
 
+        if shared_experts is None:
+            pplx_shared_output = None
+            pplx_output = pplx_outputs
+            assert isinstance(pplx_output, torch.Tensor)
+        else:
+            pplx_shared_output, pplx_output = pplx_outputs
+
+        if shared_output is not None:
+            assert pplx_shared_output is not None
+            chunked_shared_output = chunk_by_rank(
+                shared_output, pgi.rank,
+                pgi.world_size).to(pplx_shared_output.device)
+        else:
+            chunked_shared_output = None
+
         chunked_batch_output = chunk_by_rank(
             batched_output, pgi.rank, pgi.world_size).to(pplx_output.device)
 
@@ -727,6 +759,15 @@ def _pplx_moe(
                                    chunked_batch_output,
                                    atol=3e-2,
                                    rtol=3e-2)
+
+        if shared_experts is not None:
+            assert chunked_shared_output is not None
+            assert pplx_shared_output is not None
+            torch.testing.assert_close(pplx_shared_output,
+                                       chunked_shared_output,
+                                       atol=3e-2,
+                                       rtol=3e-2)
+
     finally:
         if use_internode:
             nvshmem_finalize()
@@ -788,7 +829,8 @@ def test_pplx_moe_slow(
 
 
 def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
-                    make_weights: bool, test_fn: Callable):
+                    use_shared_experts: bool, make_weights: bool,
+                    test_fn: Callable):
 
     def format_result(msg, ex=None):
         if ex is not None:
@@ -803,6 +845,14 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
         else:
             print(f"PASSED {msg}")
 
+    if use_shared_experts:
+        # Note: this config is only needed for the non-naive shared experts.
+        new_vllm_config = copy.deepcopy(vllm_config)
+        new_vllm_config.parallel_config.data_parallel_size = pgi.world_size
+        new_vllm_config.parallel_config.enable_expert_parallel = True
+        _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank,
+                         pgi.local_rank)
+
     current_platform.seed_everything(7)
     combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES,
                                [False, True], [None, [128, 128]])
@@ -819,9 +869,11 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
             use_fp8_w8a8 = False
             quant_dtype = None
 
-        test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
-                     f"dtype={dtype}, per_act_token={per_act_token_quant}, "
-                     f"block_shape={block_shape}")
+        test_desc = (
+            f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
+            f"dtype={dtype}, per_act_token={per_act_token_quant}, "
+            f"block_shape={block_shape}, use_internode={use_internode}, "
+            f"use_shared_experts={use_shared_experts}")
 
         if not use_fp8_w8a8 and (per_act_token_quant
                                  or block_shape is not None):
@@ -852,6 +904,14 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
             args["w1_s"] = w1_s
             args["w2_s"] = w2_s
 
+        if use_shared_experts:
+            args["shared_experts"] = make_shared_experts(
+                n,
+                k,
+                in_dtype=a.dtype,
+                quant_dtype=quant_dtype,
+            )
+
         try:
             test_fn(
                 pgi=pgi,
@@ -891,18 +951,20 @@ def test_pplx_prepare_finalize(
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
     parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size,
-                    use_internode, False, _pplx_prepare_finalize)
+                    use_internode, False, False, _pplx_prepare_finalize)
 
 
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
 @pytest.mark.parametrize("use_internode", [False])
+@pytest.mark.parametrize("use_shared_experts", [False, True])
 @requires_pplx
 @multi_gpu_test(num_gpus=2)
 def test_pplx_moe(
     world_dp_size: tuple[int, int],
     use_internode: bool,
+    use_shared_experts: bool,
 ):
     current_platform.seed_everything(7)
     world_size, dp_size = world_dp_size
-    parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True,
-                    _pplx_moe)
+    parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode,
+                    use_shared_experts, True, _pplx_moe)
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index 82960bd57345daf3f7789b9b9bf6f97f1b822df4..4b58a28eed1255c7bde197baf97f02f27b24faf5 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -8,6 +8,7 @@ import vllm._custom_ops as ops
 from tests.kernels.quant_utils import per_block_cast_to_int8
 from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX,
                                                     FLOAT8_E4M3_MAX)
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
@@ -282,3 +283,151 @@ def per_token_cast_to_fp8(
     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
     fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
     return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
+
+
+# CustomOp?
+class BaselineMM(torch.nn.Module):
+
+    def __init__(
+        self,
+        b: torch.Tensor,
+        out_dtype: torch.dtype,
+    ):
+        super().__init__()
+        self.b = b.to(dtype=torch.float32)
+        self.out_dtype = out_dtype
+
+    def forward(
+            self,
+            a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        return torch.mm(a.to(dtype=torch.float32),
+                        self.b).to(self.out_dtype), None
+
+
+class TestMLP(torch.nn.Module):
+
+    def __init__(
+        self,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        out_dtype: torch.dtype,
+    ):
+        super().__init__()
+        self.gate_up_proj = BaselineMM(w1, out_dtype)
+        self.down_proj = BaselineMM(w2, out_dtype)
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+def make_naive_shared_experts(
+    N: int,
+    K: int,
+    in_dtype: torch.dtype = torch.bfloat16,
+) -> torch.nn.Module:
+    w1 = torch.randn((K, N * 2), device="cuda", dtype=in_dtype) / 15
+    w2 = torch.randn((N, K), device="cuda", dtype=in_dtype) / 15
+    return TestMLP(w1, w2, out_dtype=in_dtype)
+
+
+class RealMLP(torch.nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        hidden_act: str = "silu",
+        quant_config=None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        w1_s: Optional[torch.Tensor] = None,
+        w2_s: Optional[torch.Tensor] = None,
+    ) -> None:
+        from vllm.model_executor.layers.linear import (
+            MergedColumnParallelLinear, RowParallelLinear)
+
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
+        self.gate_up_proj.register_parameter(
+            "weight", torch.nn.Parameter(w1, requires_grad=False))
+        self.gate_up_proj.register_parameter(
+            "weight_scale", torch.nn.Parameter(w1_s, requires_grad=False))
+        self.gate_up_proj.register_parameter(
+            "input_scale",
+            None)  #torch.nn.Parameter(None, requires_grad=False))
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           reduce_results=reduce_results,
+                                           prefix=f"{prefix}.down_proj")
+        self.down_proj.register_parameter(
+            "weight", torch.nn.Parameter(w2, requires_grad=False))
+        self.down_proj.register_parameter(
+            "weight_scale", torch.nn.Parameter(w2_s, requires_grad=False))
+        self.down_proj.register_parameter(
+            "input_scale",
+            None)  #torch.nn.Parameter(None, requires_grad=False))
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+def make_shared_experts(
+    N: int,
+    K: int,
+    in_dtype: torch.dtype = torch.bfloat16,
+    quant_dtype: Union[torch.dtype, str, None] = None,
+) -> torch.nn.Module:
+    from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+
+    (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
+        1,
+        N,
+        K,
+        in_dtype=in_dtype,
+        quant_dtype=quant_dtype,
+    )
+    old_dtype = torch.get_default_dtype()
+    try:
+        torch.set_default_dtype(in_dtype)
+        if quant_dtype == torch.float8_e4m3fn:
+            w1 = w1[0].transpose(0, 1)
+            w2 = w2[0].transpose(0, 1)
+            w1_s = w1_s[0].transpose(0, 1) if w1_s is not None else None
+            w2_s = w2_s[0].transpose(0, 1) if w2_s is not None else None
+            quant_config = Fp8Config(True)
+        else:
+            w1 = w1[0]
+            w2 = w2[0]
+            w1_s = None
+            w2_s = None
+            quant_config = None
+
+        return RealMLP(K,
+                       N,
+                       w1,
+                       w2,
+                       "silu",
+                       quant_config,
+                       w1_s=w1_s,
+                       w2_s=w2_s)
+    finally:
+        torch.set_default_dtype(old_dtype)
diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py
index 1095975ab2b419eec32a7f899d7a324eaddcf12a..fc4e125550180e22561c139b057f282fe7c0864c 100644
--- a/tests/kernels/quantization/nvfp4_utils.py
+++ b/tests/kernels/quantization/nvfp4_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 
+from vllm._custom_ops import scaled_fp4_quant
 from vllm.scalar_type import scalar_types
 
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
@@ -65,3 +66,10 @@ def break_fp4_bytes(a, dtype):
 
     # Reshape to final form
     return values.reshape(m, n * 2).to(dtype=dtype)
+
+
+def quant_nvfp4_tensor(a: torch.Tensor):
+    a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
+                      torch.abs(a).max().to(torch.float32))
+    a_quant, a_block_scale = scaled_fp4_quant(a, a_global_scale)
+    return a_quant, a_block_scale, a_global_scale
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index d9154d3fd7f33a52b31b6cb788e2b4353ca2736f..c440747316b8035d6fc2913df739a46b6dbbe5b4 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -11,8 +11,8 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
                                        native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul)
+    cutlass_scaled_mm, get_col_major_tma_aligned_tensor,
+    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
@@ -98,6 +98,54 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
     assert rel_diff < 0.001
 
 
+@torch.inference_mode()
+def test_w8a8_block_fp8_cutlass_matmul():
+    # Test simple case where weight.shape % 128 != 0,
+    # like in DSV3 kv_a_proj_with_mqa
+    M = 32
+    N = 576
+    K = 7168
+    block_size = [128, 128]
+    out_dtype = torch.bfloat16
+    seed = 0
+
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+    # Hopper requires row-major format for scales
+    Bs_cutlass = Bs.T.contiguous() if current_platform.is_device_capability(
+        90) else Bs
+
+    A_fp8, As = per_token_group_quant_fp8(A_fp32,
+                                          block_size[1],
+                                          column_major_scales=False)
+    # CUTLASS uses column-major format for scales
+    A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8(
+        A_fp32, block_size[1], column_major_scales=True)
+
+    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
+                                       out_dtype)
+    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass,
+                            block_size, out_dtype)
+
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.001
+
+
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
     itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py
deleted file mode 100644
index 2b745b84dae6c125f8825477f48276503198d3ea..0000000000000000000000000000000000000000
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-import torch
-import torch.nn.functional as F
-from torch import Tensor
-
-import vllm._custom_ops as ops
-from vllm.platforms import current_platform
-
-if not current_platform.has_device_capability(100):
-    pytest.skip(
-        reason="Cutlass MLA Requires compute capability of 10 or above.",
-        allow_module_level=True)
-
-
-def ref_mla(
-        out: Tensor,  # (bs, num_heads, v_head_dim)
-        query: Tensor,  # (bs, num_heads, head_dim)
-        kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
-        scale: float,
-        block_tables: Tensor,  # (bs, max_num_blocks)
-        seq_lens: Tensor,  # (bs,)
-):
-    bs, num_heads, v_head_dim = out.shape
-    head_dim = query.shape[2]
-
-    for i in range(bs):
-        # gather and flatten KV-cache
-        kv = kv_cache[
-            block_tables[i]]  # (max_num_blocks, block_size, head_dim)
-        kv = kv.view(1, -1,
-                     head_dim)[:, :seq_lens[i]]  # (1, seq_len, head_dim)
-        v = kv[:, :, :v_head_dim]
-
-        q = query[i].view(num_heads, 1, head_dim)
-        o = F.scaled_dot_product_attention(q,
-                                           kv,
-                                           v,
-                                           scale=scale,
-                                           enable_gqa=True)
-        out[i] = o.view(num_heads, v_head_dim)
-
-    return out
-
-
-@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096])
-@pytest.mark.parametrize("bs", [1, 2, 4])
-@pytest.mark.parametrize("varlen", [False, True])
-@pytest.mark.parametrize("block_size", [16, 64, 128])
-def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
-                            varlen: bool, block_size: int):
-    torch.set_default_dtype(dtype)
-    torch.set_default_device('cuda')
-    torch.manual_seed(42)
-
-    d = 576
-    h_q = 128
-    dv = 512
-
-    q_nope_dim = 128
-    q_pe_dim = 64
-    scale = (q_nope_dim + q_pe_dim)**(-0.5)
-    if varlen:
-        seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
-        seq_lens = seq_lens.clip(2).to(torch.int32)
-    else:
-        seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
-    max_seq_len = seq_lens.max().item()
-    block_num = (max_seq_len + block_size - 1) // block_size
-
-    # Pad block_num so that small blocks can be packed into full 128-sized
-    # CUTLASS tiles. One 128-wide tile can hold (128 // block_size) small
-    # blocks.
-    pack_factor = 128 // block_size
-    block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
-
-    # Amplify input values to ensure test coverage of edge cases where CUTLASS
-    # kernel errors occur with split_k settings.
-    q = torch.randn(bs, h_q, d) * 100
-    block_table = torch.randint(0,
-                                bs * block_num, (bs, block_num),
-                                dtype=torch.int32)
-
-    kv_cache = torch.randn(block_table.numel(), block_size, d)
-
-    out_ref = q.new_zeros(bs, h_q, dv)
-    ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
-    out_ans = torch.zeros_like(out_ref)
-    q_nope = q[:, :, :dv].clone()
-    q_pe = q[:, :, dv:].clone()
-    ops.cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache, seq_lens,
-                           block_table, scale)
-
-    torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2)
diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py
index 17692384ac9a9f0b279678ff678912f75ffa369b..37772464a209bc243388718dc2125daf7427faa6 100644
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
@@ -111,6 +111,49 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
         torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
 
 
+def onednn_gemm_test_helper(primitive_cache_size: int,
+                            m: int,
+                            n: int,
+                            k: int,
+                            use_bias: bool,
+                            use_stride: bool,
+                            dtype: torch.dtype = torch.bfloat16,
+                            device: str = "cpu"):
+    if use_stride:
+        a = torch.rand((m, 2 * k), dtype=dtype, device=device) * 1.5
+        a = a[:, :k]
+    else:
+        a = torch.rand((m, k), dtype=dtype, device=device) * 1.5
+
+    b = torch.rand((n, k), dtype=dtype, device=device) * 1.5
+
+    if use_bias:
+        bias = torch.rand((n, ), device=device, dtype=dtype) * 5
+        bias_f32 = bias.float()
+    else:
+        bias = None
+        bias_f32 = None
+
+    handler = ops.create_onednn_mm(
+        b.t(),
+        primitive_cache_size,
+    )
+
+    out = ops.onednn_mm(handler, a, bias)
+    baseline = torch.nn.functional.linear(a.float(), b.float(),
+                                          bias_f32).to(dtype=a.dtype)
+
+    torch.testing.assert_close(out, baseline)
+
+    if use_bias:
+        # To test runtime bias setting
+        out = ops.onednn_mm(handler, a, None)
+        baseline = torch.nn.functional.linear(a.float(), b.float(),
+                                              None).to(dtype=a.dtype)
+
+        torch.testing.assert_close(out, baseline)
+
+
 @pytest.mark.parametrize("n,k", NK_FACTORS)
 @pytest.mark.parametrize("m_list", M_FACTORS)
 @pytest.mark.parametrize("per_tensor_a_scale", [True, False])
@@ -142,3 +185,30 @@ def test_onednn_int8_scaled_gemm(
             use_azp=use_azp,
             out_dtype=output_type,
         )
+
+
+@pytest.mark.parametrize("n,k", NK_FACTORS)
+@pytest.mark.parametrize("m_list", M_FACTORS)
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("use_stride", [True, False])
+@pytest.mark.parametrize("dtype", DTYPE)
+@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
+def test_onednn_gemm(
+    n: int,
+    k: int,
+    m_list: tuple[int],
+    use_bias: bool,
+    use_stride: bool,
+    dtype: torch.dtype,
+    primitive_cache_size: int,
+):
+    for m in m_list:
+        onednn_gemm_test_helper(
+            primitive_cache_size=primitive_cache_size,
+            m=m,
+            n=n,
+            k=k,
+            use_bias=use_bias,
+            use_stride=use_stride,
+            dtype=dtype,
+        )
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index fa4125840a010fb252df0835b6a7c46919d3b497..c9bf85f6e2a5c0f5a84943f17cd4f0ae40611b56 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
                        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
     # We treat N-dimensional group scaling as extended numpy-style broadcasting
-    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # in numpy simply stretches dimensions with an extent of 1 to match
     # the target shape by repeating the data along that dimension (broadcasting)
     # , we extend these semantics to say if the extent of a dimension in the
     # source shape is not 1 and does not match the target shape we repeat each
@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
     # then we would expand a to:
     #       a = [[1, 1, 2, 2],
     #            [3, 3, 4, 4]]
-    # NOTE this function this function does not explicitly broadcast dimensions
+    # NOTE this function does not explicitly broadcast dimensions
     # with an extent of 1, since this can be done implicitly by pytorch
     def group_broadcast(t, shape):
         for i, s in enumerate(shape):
diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py
index 352ab63552de74bd194a5908e11b460bfbc5578a..ca2f04dabfc98d434d189fdc4b006ba91ed12fa2 100644
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -128,7 +128,7 @@ if __name__ == "__main__":
     print(f"initialized! My rank is {my_rank}")
 
     config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
         kv_buffer_device='cuda',
         kv_buffer_size=1e9,
         kv_rank=my_rank,
diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py
index 32116608a2177ac5315327a05f27dbb103c4c3cb..99ad2b43aeac8b6af1415a9c3d7f176dfe3c3c47 100644
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -137,7 +137,7 @@ if __name__ == "__main__":
     )
 
     config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
         kv_buffer_device='cuda',
         kv_buffer_size=1e9,
         kv_rank=my_rank,
diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index 44755c603f281cfffa48e881049503367c6faa11..35d0245759154df2d623023b2a175dfcef8716aa 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -59,10 +59,10 @@ async def requests_processing_time(llm,
 @pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
     """ 
-    The add_lora function is used to pre-load some LoRA adapters into the
+    The add_lora function is used to preload some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
     this functionality, we use the async engine to process some requests - We
-    do it twice, once with add_lora() pre-loading and once without.
+    do it twice, once with add_lora() preloading and once without.
 
     We measure the request processing time in both cases and expect the time 
     to be lesser in the case with add_lora() calls.
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 6e2dda464d8eb74a279150c9dd1874fb82fab517..6735b7cd9e4367e9a770da6b9367261855495a9d 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -11,21 +11,21 @@ import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.config import LoRAConfig
-from vllm.lora.fully_sharded_layers import (
-    ColumnParallelLinearWithShardedLoRA,
-    MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
-    RowParallelLinearWithShardedLoRA)
+from vllm.config.lora import LoRAConfig
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              ColumnParallelLinearWithShardedLoRA,
                               LogitsProcessorWithLoRA, LoRAMapping,
                               MergedColumnParallelLinearWithLoRA,
+                              MergedColumnParallelLinearWithShardedLoRA,
                               MergedQKVParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithShardedLoRA,
                               QKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithShardedLoRA,
                               ReplicatedLinearWithLoRA,
                               RowParallelLinearWithLoRA,
+                              RowParallelLinearWithShardedLoRA,
                               VocabParallelEmbeddingWithLoRA)
 # yapf: enable
 from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
@@ -60,9 +60,9 @@ DEVICES = ([
 # prefill stage(True) or decode stage(False)
 STAGES = [True, False]
 
-NUM_RANDOM_SEEDS = 6
+NUM_RANDOM_SEEDS = 2
 
-VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
+VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py
index 01bc102bd112bc945d88437181218f07286fbd47..be6409000ae7723d786f98ba0fc8667ca854f690 100644
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         VllmConfig)
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
@@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
     adapters that define additional tokens.
     """
 
-    # Setup a base model compatible with the sql_lora_files adapter and
+    # Set up a base model compatible with the sql_lora_files adapter and
     # a known number of tokens in the base model.
     model_config = ModelConfig(
         model=llama_2_7b_base_huggingface_id,
@@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
     adapters that do not define additional tokens.
     """
 
-    # Setup a base model compatible with the qwen25vl_lora_files adapter and
+    # Set up a base model compatible with the qwen25vl_lora_files adapter and
     # a known number of tokens in the base model.
     model_config = ModelConfig(
         model=qwen25vl_base_huggingface_id,
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index c9ab32edc7f32d703ae0fd29db27ad50b0004d15..a5802c108c6be4be3aada1d82d0940925bf37b39 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index df8696cf58e0ffe9a1a78af3bb518c98c60a36a3..ffffb5d8eab90d8be4b44d927dc6575e4ab6e5d4 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -7,7 +7,7 @@ import shutil
 
 import pytest
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.peft_helper import PEFTHelper
 
 ERROR_CASES = [
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index a836ff94ba3ed5a58526bb8fb7d95d11a06bd4f7..9c47abf8f4dceb8112841e2e2fb233670293a3e6 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,9 +6,10 @@ import random
 import tempfile
 from unittest.mock import patch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
-                         VllmConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker
diff --git a/tests/model_executor/model_loader/test_registry.py b/tests/model_executor/model_loader/test_registry.py
index 93a3e34835b5a896a872e772149738a26d2c4986..639ee6db9270fd17b2cfe9758267b19535425cc9 100644
--- a/tests/model_executor/model_loader/test_registry.py
+++ b/tests/model_executor/model_loader/test_registry.py
@@ -4,7 +4,8 @@
 import pytest
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import (get_model_loader,
                                               register_model_loader)
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 140f00294765da870a4ab1c124b35f75e972287e..86139d598582d9f1a45cce7874b88f0688546dd8 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -13,13 +13,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
                                                             vllm_topk_softmax)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled)
-from vllm.model_executor.layers.layernorm import (
-    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
-    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
+from vllm.model_executor.layers.layernorm import (RMSNorm,
+                                                  dispatch_rocm_rmsnorm_func,
+                                                  fused_add_rms_norm, rms_norm)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     cutlass_scaled_mm, dispatch_w8a8_blockscale_func, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
 
+RMS_NORM_SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+
 
 # Registered subclass for test
 @CustomOp.register("relu3")
@@ -149,24 +151,27 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
 
 
 @pytest.mark.parametrize("add_residual", [True, False])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
 @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
 @pytest.mark.skipif(not current_platform.is_rocm(),
                     reason="AITER is a feature exclusive for ROCm")
-def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
-                           use_rocm_aiter_norm: str, monkeypatch):
+def test_rms_norm_dispatch(add_residual: bool, dtype: torch.dtype,
+                           use_rocm_aiter: str, use_rocm_aiter_norm: str,
+                           monkeypatch):
     monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
     monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
-    rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
-
-    if not add_residual:
-        if current_platform.is_rocm() and int(use_rocm_aiter) and int(
-                use_rocm_aiter_norm):
-            assert rms_norm_func == rocm_aiter_rms_norm
-        else:
-            assert rms_norm_func == rms_norm
-    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
-            use_rocm_aiter_norm):
-        assert rms_norm_func == rocm_aiter_fused_add_rms_norm
-    else:
+    rms_norm_func = dispatch_rocm_rmsnorm_func(add_residual, dtype)
+
+    should_use_rocm_aiter = current_platform.is_rocm() and int(use_rocm_aiter) \
+        and int(use_rocm_aiter_norm) and dtype in RMS_NORM_SUPPORTED_DTYPES
+
+    if add_residual and should_use_rocm_aiter:
+        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+    elif should_use_rocm_aiter:
+        assert rms_norm_func == torch.ops.vllm.rocm_aiter_rms_norm
+    elif add_residual:
         assert rms_norm_func == fused_add_rms_norm
+    else:
+        assert rms_norm_func == rms_norm
diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py
index b4c771840196c066adb76aad942852ddb55c4a29..22ceb27869ac4b3e78b1967dfbacf76d5bcd4217 100644
--- a/tests/models/language/generation/test_bart.py
+++ b/tests/models/language/generation/test_bart.py
@@ -178,6 +178,7 @@ def run_test(
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                 dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
 
@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+@pytest.mark.skip(reason="bart not supported in V1")
 def test_models_distributed(hf_runner, vllm_runner,
                             example_encoder_decoder_prompts,
                             distributed_executor_backend, model, dtype,
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index 4c4434c94145ac350db3fb6c8f798f5335fbdbc9..6fc8f1301fdb934f23712401a0c14b0b61dd95b8 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 
 # These have unsupported head_dim for FA. We do not
-# not have a clean way to fall back, so we fail with
+# have a clean way to fall back, so we fail with
 # a clear msg when it happens.
 # https://github.com/vllm-project/vllm/issues/14524
 REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
@@ -93,7 +93,7 @@ AITER_MODEL_LIST = [
             "allenai/OLMoE-1B-7B-0924-Instruct",
             marks=[pytest.mark.cpu_model],
         ),
-        pytest.param("swiss-ai/Apertus-8B"),  # apertus
+        pytest.param("swiss-ai/Apertus-8B-2509"),  # apertus
     ])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 31ca3a6f0f985d9195e8dac04a348d35b892a139..d0e42062099eca101e015db8b2aee31333f45016 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -25,8 +25,7 @@ SSM_MODELS = [
 
 HYBRID_MODELS = [
     "ai21labs/Jamba-tiny-dev",
-    # skipping until vLLM implementation issues are resolved
-    # "pfnet/plamo-2-1b",
+    "pfnet/plamo-2-1b",
     "Zyphra/Zamba2-1.2B-instruct",
     "hmellor/tiny-random-BambaForCausalLM",
     "ibm-granite/granite-4.0-tiny-preview",
@@ -34,20 +33,10 @@ HYBRID_MODELS = [
     "LiquidAI/LFM2-1.2B",
 ]
 
-HF_UNSUPPORTED_MODELS = [
-    # The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
-    # doesn't compare vLLM output with HF output.
-    # See https://github.com/huggingface/transformers/pull/35943
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    # transformers 4.55 is still producing garbage for this model
-    # TODO(tdoublep): follow-up on transformers side
-    "ibm-granite/granite-4.0-tiny-preview"
-]
-
 V1_SUPPORTED_MODELS = [
     "state-spaces/mamba-130m-hf",
     "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
     "yujiepan/mamba2-codestral-v0.1-tiny-random",
     "Zyphra/Zamba2-1.2B-instruct",
     "hmellor/tiny-random-BambaForCausalLM",
@@ -58,6 +47,7 @@ V1_SUPPORTED_MODELS = [
 
 FULL_CUDA_GRAPH_MODELS = [
     "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
     "Zyphra/Zamba2-1.2B-instruct",
 ]
 
@@ -65,6 +55,11 @@ V0_UNSUPPORTED_MODELS = [
     "LiquidAI/LFM2-1.2B",
 ]
 
+FP32_STATE_MODELS = [
+    "state-spaces/mamba-130m-hf",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
 # Avoid OOM
 MAX_NUM_SEQS = 4
 
@@ -85,20 +80,13 @@ def test_models(
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
-        hf_version_check = model_info.check_transformers_version(
-            on_fail="return")
+        model_info.check_transformers_version(on_fail="skip")
     except ValueError:
-        hf_version_check = None
-
-    if hf_version_check is not None:
-        print(f"Skipping transformers comparison because: {hf_version_check}")
+        pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -116,7 +104,7 @@ def test_models(
     else:
         vllm_v1_outputs = None
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
@@ -125,12 +113,10 @@ def test_models(
         )
 
     if model in V1_SUPPORTED_MODELS:
-        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-        assert ref_outputs is not None
         check_logprobs_close(
-            outputs_0_lst=ref_outputs,
+            outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v1_outputs,
-            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_0="hf",
             name_1="vllm-v1",
         )
 
@@ -315,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     finished_requests_ids is larger than the maximum mamba block capacity.
 
     This could generally happen due to the fact that hybrid does support
-    statelessness mechanism where it can cleanup new incoming requests in
+    statelessness mechanism where it can clean up new incoming requests in
     a single step.
     """
     try:
@@ -336,7 +322,7 @@ def test_state_cleanup(
     This test is for verifying that the Hybrid state is cleaned up between
     steps.
     
-    If its not cleaned, an error would be expected.
+    If it's not cleaned, an error would be expected.
     """
     try:
         with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
@@ -397,11 +383,8 @@ def test_full_cuda_graph(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -416,7 +399,7 @@ def test_full_cuda_graph(
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
@@ -424,17 +407,15 @@ def test_full_cuda_graph(
             name_1="vllm-v0",
         )
 
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-    assert ref_outputs is not None
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
     )
 
 
-@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
+@pytest.mark.parametrize("model", FP32_STATE_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_fp32_state(
@@ -455,11 +436,8 @@ def test_fp32_state(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -475,18 +453,16 @@ def test_fp32_state(
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_v0_outputs,
+        name_0="hf",
+        name_1="vllm-v0",
+    )
 
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
     )
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index af51a60edfd622f6f5a45ac839dc74a4ad63a8e1..845afbfa8a45ede4c7977f3e44fbb50d487e29d4 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.3",
     # uses the v3-Tekken tokenizer
     "mistralai/Ministral-8B-Instruct-2410",
-    # Mistral-Nemo is to big for CI, but passes locally
+    # Mistral-Nemo is too big for CI, but passes locally
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
@@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
 
 
 def test_mistral_function_call_nested_json():
-    """Ensure that the function-name regex captures the entire outer-most
+    """Ensure that the function-name regex captures the entire outermost
     JSON block, including nested braces."""
 
     # Create a minimal stub tokenizer that provides the few attributes the
diff --git a/tests/models/language/generation_ppl_test/__init__.py b/tests/models/language/generation_ppl_test/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..550e874cf85791cd5eecb65382909e484cb9e9d2
--- /dev/null
+++ b/tests/models/language/generation_ppl_test/ppl_utils.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/docs/transformers/perplexity
+from typing import Optional, cast
+
+import pytest
+import torch
+from datasets import load_dataset
+
+from tests.models.utils import (GenerateModelInfo,
+                                TokensTextLogprobsPromptLogprobs)
+from vllm.logprobs import Logprob
+
+# See #24485
+PPL_TOL = 0.01
+MAX_LENGTH = 1024
+
+
+@torch.inference_mode
+def wikitext_ppl_test(hf_runner,
+                      vllm_runner,
+                      model_info: GenerateModelInfo,
+                      max_length=MAX_LENGTH,
+                      vllm_extra_kwargs=None,
+                      atol=PPL_TOL):
+
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
+    if not model_info.enable_test:
+        pytest.skip("Skipping test.")
+
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+
+    # Allow vllm to test using the given dtype, such as float32
+    vllm_extra_kwargs = vllm_extra_kwargs or {}
+    vllm_extra_kwargs["dtype"] = model_info.dtype
+
+    # Allow vllm to test using hf_overrides
+    if model_info.hf_overrides is not None:
+        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
+
+    with vllm_runner(model_info.name,
+                     gpu_memory_utilization=0.7,
+                     max_model_len=max_length,
+                     max_num_seqs=1,
+                     enforce_eager=True,
+                     **vllm_extra_kwargs) as vllm_model:
+        # Use max_num_seqs=1 to avoid OOM,
+        # and batch different requests together.
+
+        model_config = vllm_model.llm.llm_engine.model_config
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert (model_info.architecture in model_config.architectures)
+
+        max_length = min(model_config.max_model_len - 1, max_length)
+        stride = max_length
+
+        tokenizer = vllm_model.llm.get_tokenizer()
+        tokens = tokenizer.encode("\n\n".join(dataset["text"]))
+        n_tokens = len(tokens)
+
+        chunks = []
+        for begin_loc in range(0, n_tokens, stride):
+            end_loc = min(begin_loc + max_length, n_tokens)
+            chunks.append(tokens[begin_loc:end_loc])
+
+        outputs = vllm_model.generate_greedy_logprobs(prompts=chunks,
+                                                      max_tokens=1,
+                                                      num_logprobs=None,
+                                                      num_prompt_logprobs=0,
+                                                      use_tqdm=False)
+        nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+        n_tokens = 0
+        for output in outputs:
+            output = cast(TokensTextLogprobsPromptLogprobs, output)
+            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+
+            assert token_datas[0] is None
+            token_log_probs = []
+            for token_data in token_datas[1:]:
+                assert token_data is not None
+                assert len(token_data) == 1
+                token_log_prob = list(token_data.values())[0].logprob
+                token_log_probs.append(token_log_prob)
+
+            neg_log_likelihood = -torch.tensor(
+                token_log_probs, dtype=torch.float32, device="cpu").sum()
+            nll_sum += neg_log_likelihood
+            n_tokens += len(token_log_probs)
+        vllm_ppl = float(torch.exp(nll_sum / n_tokens))
+        vllm_dtype = model_config.dtype
+
+    # Accelerate ppl test by setting Transformers ppl score to a constant
+    if model_info.hf_ppl is None:
+        with hf_runner(
+                model_info.name,
+                dtype=model_info.hf_dtype,
+        ) as hf_model:
+            nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
+            n_tokens = 0
+            for chunk in chunks:
+                inputs = hf_model.wrap_device(
+                    {"input_ids": torch.tensor([chunk])})
+                input_ids = inputs["input_ids"]
+                outputs = hf_model.model(input_ids, labels=input_ids)
+                neg_log_likelihood = outputs.loss
+
+                neg_log_likelihood = neg_log_likelihood.to(torch.float32).cpu()
+
+                num_loss_tokens = len(chunk) - 1
+                nll_sum += neg_log_likelihood * num_loss_tokens
+                n_tokens += num_loss_tokens
+
+            hf_ppl = float(torch.exp(nll_sum / n_tokens))
+            hf_dtype = next(hf_model.model.parameters()).dtype
+    else:
+        hf_ppl = model_info.hf_ppl
+        hf_dtype = "Constant"
+
+    differ = (vllm_ppl - hf_ppl) / hf_ppl
+    print("Model:", model_info.name)
+    print("VLLM:", vllm_dtype, vllm_ppl)
+    print("Transformers:", hf_dtype, hf_ppl)
+    print("Difference (%):", differ * 100)
+
+    # PPL the smaller, the better
+    # We are not concerned that the vllm PPL is less than Transformers,
+    # so we only perform one-sided testing.
+    assert differ < atol
diff --git a/tests/models/language/generation_ppl_test/test_gemma.py b/tests/models/language/generation_ppl_test/test_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..5324de143d67484d6cfe50f83a866c022efe31aa
--- /dev/null
+++ b/tests/models/language/generation_ppl_test/test_gemma.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from tests.models.utils import GenerateModelInfo
+
+from .ppl_utils import wikitext_ppl_test
+
+MODELS = [
+    GenerateModelInfo("google/gemma-2b"),
+    GenerateModelInfo("google/gemma-2-2b"),
+    GenerateModelInfo("google/gemma-3-4b-it"),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
+    wikitext_ppl_test(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/generation_ppl_test/test_gpt.py b/tests/models/language/generation_ppl_test/test_gpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f9e55a242349455ff1e9c94ca0c22118512b78
--- /dev/null
+++ b/tests/models/language/generation_ppl_test/test_gpt.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from tests.models.utils import GenerateModelInfo
+
+from .ppl_utils import wikitext_ppl_test
+
+MODELS = [GenerateModelInfo("openai-community/gpt2-large")]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
+    wikitext_ppl_test(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/generation_ppl_test/test_qwen.py b/tests/models/language/generation_ppl_test/test_qwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3127cbaac4747db1a97ebbc72200dbde1024cf
--- /dev/null
+++ b/tests/models/language/generation_ppl_test/test_qwen.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from tests.models.utils import GenerateModelInfo
+
+from .ppl_utils import wikitext_ppl_test
+
+MODELS = [
+    GenerateModelInfo("Qwen/Qwen3-0.6B"),
+    GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"),
+    # transformers:
+    # Loading a GPTQ quantized model requires optimum, gptqmodel
+    # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"),
+]
+
+
+@pytest.mark.parametrize("model_info", MODELS)
+def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
+    wikitext_ppl_test(hf_runner, vllm_runner, model_info)
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index a74ad2aa2597247e1f3063c9ba9ea00ae80326a4..86751e0a4d5f4ce23f104f2e1aa1748d99dee4eb 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
                                   example_prompts,
                                   vllm_extra_kwargs=None,
                                   hf_model_callback=None):
-    if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
-        pytest.skip("Skipping test.")
+    pytest.skip("Debug only, ci prefers to use mteb test.")
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
@@ -62,7 +59,7 @@ def correctness_test_embed_models(hf_runner,
 
     with hf_runner(
             model_info.name,
-            dtype="float32",
+            dtype=model_info.hf_dtype,
             is_sentence_transformer=True,
     ) as hf_model:
 
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index f918b2b91bcc3122cc8322c3e736a3f9a36dc8cc..41574b844a668fb2d0283e1f44a4d1ed523dcc09 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -7,7 +7,7 @@ import pytest
 from vllm.config import PoolerConfig
 from vllm.platforms import current_platform
 
-from ...utils import check_embeddings_close, check_transformers_version
+from ...utils import check_embeddings_close
 
 
 @pytest.mark.parametrize(
@@ -27,12 +27,17 @@ from ...utils import check_embeddings_close, check_transformers_version
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param(
+            "sentence-transformers/stsb-roberta-base-v2",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 def test_models(
@@ -42,8 +47,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model, max_transformers_version="4.53.2")
 
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
diff --git a/tests/models/language/pooling_mteb_test/__init__.py b/tests/models/language/pooling_mteb_test/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
similarity index 74%
rename from tests/models/language/pooling/mteb_utils.py
rename to tests/models/language/pooling_mteb_test/mteb_utils.py
index 640858125bfcaccb9050815c2168bda4fa28f54a..56a105e96e5ee10bed7b09c401d0097569a238ff 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -9,8 +9,10 @@ import mteb
 import numpy as np
 import pytest
 import requests
+import torch
 
-from tests.models.utils import EmbedModelInfo, RerankModelInfo
+from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
+                                check_embeddings_close)
 
 # Most embedding models on the STS12 task (See #17175):
 # - Model implementation and minor changes in tensor dtype
@@ -18,7 +20,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
 # - Different model results in differences more than 1e-3
 # 1e-4 is a good tolerance threshold
 MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 0.02
+MTEB_EMBED_TOL = 1e-4
 
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
@@ -163,15 +165,20 @@ def mteb_test_embed_models(hf_runner,
                            model_info: EmbedModelInfo,
                            vllm_extra_kwargs=None,
                            hf_model_callback=None,
-                           atol=MTEB_RERANK_TOL):
+                           atol=MTEB_EMBED_TOL):
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
     if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
         pytest.skip("Skipping test.")
 
+    # Test embed_dims, isnan and whether to use normalize
+    example_prompts = ["The chef prepared a delicious meal." * 1000]
+
+    # Allow vllm to test using the given dtype, such as float32
     vllm_extra_kwargs = vllm_extra_kwargs or {}
     vllm_extra_kwargs["dtype"] = model_info.dtype
 
+    # Allow vllm to test using hf_overrides
     if model_info.hf_overrides is not None:
         vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
 
@@ -183,8 +190,12 @@ def mteb_test_embed_models(hf_runner,
 
         model_config = vllm_model.llm.llm_engine.model_config
 
+        # Confirm whether vllm is using the correct architecture
         if model_info.architecture:
             assert model_info.architecture in model_config.architectures
+
+        # Confirm whether vllm uses the correct default_pooling_type, which
+        # relates to whether chunked prefill and prefix caching are enabled
         assert (model_config._model_info.default_pooling_type ==
                 model_info.default_pooling_type)
 
@@ -192,22 +203,46 @@ def mteb_test_embed_models(hf_runner,
                                               MTEB_EMBED_TASKS)
         vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
 
-    with hf_runner(model_info.name,
-                   is_sentence_transformer=True,
-                   dtype="float32") as hf_model:
-
-        if hf_model_callback is not None:
-            hf_model_callback(hf_model)
-
-        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-        st_dtype = next(hf_model.model.parameters()).dtype
+        # Test embed_dims, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts,
+                                        truncate_prompt_tokens=-1)
+        assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(model_info.name,
+                       is_sentence_transformer=True,
+                       dtype=model_info.hf_dtype) as hf_model:
+
+            # e.g. setting default parameters for the encode method of hf_runner
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
+
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+
+            # Test embed_dims and whether to use normalize
+            hf_outputs = hf_model.encode(example_prompts)
+            check_embeddings_close(
+                embeddings_0_lst=hf_outputs,
+                embeddings_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+                tol=1e-2,
+            )
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
 
 
 def run_mteb_rerank(cross_encoder, tasks, languages):
@@ -243,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
     return main_score
 
 
-def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
+def mteb_test_rerank_models_hf(hf_runner,
+                               model_name,
+                               hf_dtype="float32",
+                               hf_model_callback=None):
     with hf_runner(model_name, is_cross_encoder=True,
-                   dtype="float32") as hf_model:
+                   dtype=hf_dtype) as hf_model:
 
         original_predict = hf_model.predict
 
@@ -279,14 +317,16 @@ def mteb_test_rerank_models(hf_runner,
                             hf_model_callback=None,
                             vllm_mteb_encoder=VllmMtebEncoder,
                             atol=MTEB_RERANK_TOL):
+    # A model family has many models with the same architecture,
+    # and we don't need to test each one.
     if not model_info.enable_test:
-        # A model family has many models with the same architecture,
-        # and we don't need to test each one.
         pytest.skip("Skipping test.")
 
+    # Allow vllm to test using the given dtype, such as float32
     vllm_extra_kwargs = vllm_extra_kwargs or {}
     vllm_extra_kwargs["dtype"] = model_info.dtype
 
+    # Allow vllm to test using hf_overrides
     if model_info.hf_overrides is not None:
         vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
 
@@ -299,9 +339,15 @@ def mteb_test_rerank_models(hf_runner,
 
         model_config = vllm_model.llm.llm_engine.model_config
 
+        # Confirm whether vllm is using the correct architecture
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
+
+        # Score API is only enabled for num_labels == 1
         assert model_config.hf_config.num_labels == 1
+
+        # Confirm whether vllm uses the correct default_pooling_type, which
+        # relates to whether chunked prefill and prefix caching are enabled
         assert (model_config._model_info.default_pooling_type ==
                 model_info.default_pooling_type)
 
@@ -310,12 +356,20 @@ def mteb_test_rerank_models(hf_runner,
                                           languages=MTEB_RERANK_LANGS)
         vllm_dtype = model_config.dtype
 
-    st_main_score, st_dtype = mteb_test_rerank_models_hf(
-        hf_runner, model_info.name, hf_model_callback)
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        st_main_score, st_dtype = mteb_test_rerank_models_hf(
+            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
 
     print("Model:", model_info.name)
     print("VLLM:", vllm_dtype, vllm_main_score)
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert st_main_score - vllm_main_score < atol
diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py
similarity index 89%
rename from tests/models/language/pooling/test_baai.py
rename to tests/models/language/pooling_mteb_test/test_baai.py
index 6fbe0e82d7f8a9d00a18ff9772901aa473c0daca..e131c9b1038ded91d3a13a7b71b9220343b16378 100644
--- a/tests/models/language/pooling/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -2,16 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
-                      EmbedModelInfo, LASTPoolingEmbedModelInfo,
-                      RerankModelInfo)
-from .embed_utils import correctness_test_embed_models
+from tests.models.language.pooling.embed_utils import (
+    correctness_test_embed_models)
+from tests.models.utils import (CLSPoolingEmbedModelInfo,
+                                CLSPoolingRerankModelInfo, EmbedModelInfo,
+                                LASTPoolingEmbedModelInfo, RerankModelInfo)
+
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
                              architecture="BertModel",
+                             mteb_score=0.779336792,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
                              architecture="BertModel",
@@ -52,10 +55,12 @@ MODELS = [
     ########## XLMRobertaModel
     CLSPoolingEmbedModelInfo("BAAI/bge-m3",
                              architecture="XLMRobertaModel",
+                             mteb_score=0.787343078,
                              enable_test=True),
     ########## Qwen2Model
     LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
                               architecture="Qwen2Model",
+                              mteb_score=0.75724465,
                               dtype="float32",
                               enable_test=True),
 ]
@@ -65,6 +70,7 @@ RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-base",
         architecture="XLMRobertaForSequenceClassification",
+        mteb_score=0.32398,
         enable_test=True),
     CLSPoolingRerankModelInfo(
         "BAAI/bge-reranker-large",
diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
similarity index 95%
rename from tests/models/language/pooling/test_bge_reranker_v2_gemma.py
rename to tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index f473e0ba01ffa2c64403c60eb4ba5573bf6ac0f0..1eca2a2c0abd97a685fa70ac70e53855649cfee1 100644
--- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -7,13 +7,14 @@ import pytest
 import torch
 
 from tests.conftest import HfRunner
-
-from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
-from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models
+from tests.models.language.pooling_mteb_test.mteb_utils import (
+    VllmMtebEncoder, mteb_test_rerank_models)
+from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
 
 RERANK_MODELS = [
     LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
                                architecture="GemmaForSequenceClassification",
+                               mteb_score=0.33757,
                                hf_overrides={
                                    "architectures":
                                    ["GemmaForSequenceClassification"],
@@ -104,7 +105,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.prompt = PROMPT
         self.query_template = "A: {query}\n"
         self.document_template = "B: {doc}\n{prompt}"
 
@@ -119,7 +119,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
         _sentences = []
         for query, corpus, prompt in sentences:
             query = self.query_template.format(query=query)
-            corpus = self.document_template.format(doc=corpus, prompt=prompt)
+            corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
             _sentences.append((query, corpus, prompt))
 
         return super().predict(_sentences, *args, **kwargs)
diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
similarity index 75%
rename from tests/models/language/pooling/test_cross_encoder.py
rename to tests/models/language/pooling_mteb_test/test_cross_encoder.py
index 8c1bc5779b8a1c8f2a0d30e3dd31f1e0568da2f8..ad320fae0c85a5a7e4b824d0843db4d7054f5179 100644
--- a/tests/models/language/pooling/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@@ -2,14 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo,
-                      RerankModelInfo)
+from tests.models.utils import (CLSPoolingRerankModelInfo,
+                                LASTPoolingRerankModelInfo, RerankModelInfo)
+
 from .mteb_utils import mteb_test_rerank_models
 
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
+                              mteb_score=0.32898,
                               architecture="BertForSequenceClassification"),
     LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
+                               mteb_score=0.25736,
                                architecture="Qwen3ForSequenceClassification")
 ]
 
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py
similarity index 80%
rename from tests/models/language/pooling/test_gte.py
rename to tests/models/language/pooling_mteb_test/test_gte.py
index 9911620c018ef82fbabc8bb5e7dc1f71233bb717..9ae43fd05bf78c47a8a66bef1c942fe821325199 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -3,15 +3,18 @@
 
 import pytest
 
-from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
-                      EmbedModelInfo, LASTPoolingEmbedModelInfo,
-                      RerankModelInfo, check_transformers_version)
-from .embed_utils import correctness_test_embed_models
+from tests.models.language.pooling.embed_utils import (
+    correctness_test_embed_models)
+from tests.models.utils import (CLSPoolingEmbedModelInfo,
+                                CLSPoolingRerankModelInfo, EmbedModelInfo,
+                                LASTPoolingEmbedModelInfo, RerankModelInfo)
+
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("thenlper/gte-large",
+                             mteb_score=0.76807651,
                              architecture="BertModel",
                              enable_test=True),
     CLSPoolingEmbedModelInfo("thenlper/gte-base",
@@ -30,28 +33,37 @@ MODELS = [
                              architecture="BertModel",
                              enable_test=False),
     ########### NewModel
+    # These three architectures are almost the same, but not exactly the same.
+    # For example,
+    # - whether to use token_type_embeddings
+    # - whether to use context expansion
+    # So only test one (the most widely used) model
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
                              architecture="GteNewModel",
+                             mteb_score=0.775074696,
                              hf_overrides={"architectures": ["GteNewModel"]},
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
                              architecture="GteNewModel",
                              hf_overrides={"architectures": ["GteNewModel"]},
-                             enable_test=True),
+                             enable_test=False),
     ########### Qwen2ForCausalLM
     LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+                              mteb_score=0.758473459018872,
                               architecture="Qwen2ForCausalLM",
                               enable_test=True),
     ########## ModernBertModel
     CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
+                             mteb_score=0.748193353,
                              architecture="ModernBertModel",
                              enable_test=True),
     ########## Qwen3ForCausalLM
     LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
+                              mteb_score=0.771163695,
                               architecture="Qwen3ForCausalLM",
                               dtype="float32",
                               enable_test=True),
@@ -65,10 +77,12 @@ RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         # classifier_pooling: mean
         "Alibaba-NLP/gte-reranker-modernbert-base",
+        mteb_score=0.33386,
         architecture="ModernBertForSequenceClassification",
         enable_test=True),
     CLSPoolingRerankModelInfo(
         "Alibaba-NLP/gte-multilingual-reranker-base",
+        mteb_score=0.33062,
         architecture="GteNewForSequenceClassification",
         hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
         enable_test=True),
@@ -78,10 +92,6 @@ RERANK_MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
@@ -89,10 +99,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
 def test_embed_models_correctness(hf_runner, vllm_runner,
                                   model_info: EmbedModelInfo,
                                   example_prompts) -> None:
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        check_transformers_version(model_info.name,
-                                   max_transformers_version="4.53.2")
-
     correctness_test_embed_models(hf_runner, vllm_runner, model_info,
                                   example_prompts)
 
diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling_mteb_test/test_intfloat.py
similarity index 85%
rename from tests/models/language/pooling/test_intfloat.py
rename to tests/models/language/pooling_mteb_test/test_intfloat.py
index 6cae53a660ad834246a7944d4c322479ebeebb3f..0d6026898ad4addf490404ebf7c93abff48d6492 100644
--- a/tests/models/language/pooling/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@@ -2,14 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
-from .embed_utils import correctness_test_embed_models
+from tests.models.language.pooling.embed_utils import (
+    correctness_test_embed_models)
+from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     ########## BertModel
     CLSPoolingEmbedModelInfo("intfloat/e5-small",
                              architecture="BertModel",
+                             mteb_score=0.742285423,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("intfloat/e5-base",
                              architecture="BertModel",
@@ -23,6 +26,7 @@ MODELS = [
     ########## XLMRobertaModel
     CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
                              architecture="XLMRobertaModel",
+                             mteb_score=0.779325955,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
                              architecture="XLMRobertaModel",
@@ -36,7 +40,7 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
similarity index 90%
rename from tests/models/language/pooling/test_jina.py
rename to tests/models/language/pooling_mteb_test/test_jina.py
index 37c5bdc97dd9842bfdf83495b5b9b652f7fae82d..0a77a78bb31b672d3f5593d8aa9a96716093be59 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -4,16 +4,18 @@ from functools import partial
 
 import pytest
 
+from tests.models.language.pooling.embed_utils import (
+    check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
+from tests.models.utils import (CLSPoolingEmbedModelInfo,
+                                CLSPoolingRerankModelInfo, EmbedModelInfo,
+                                RerankModelInfo)
 from vllm import PoolingParams
 
-from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
-                      EmbedModelInfo, RerankModelInfo)
-from .embed_utils import (check_embeddings_close,
-                          correctness_test_embed_models, matryoshka_fy)
 from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
 
 EMBEDDING_MODELS = [
     CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
+                             mteb_score=0.824413164,
                              architecture="XLMRobertaModel",
                              is_matryoshka=True)
 ]
@@ -21,6 +23,7 @@ EMBEDDING_MODELS = [
 RERANK_MODELS = [
     CLSPoolingRerankModelInfo(
         "jinaai/jina-reranker-v2-base-multilingual",
+        mteb_score=0.33643,
         architecture="XLMRobertaForSequenceClassification")
 ]
 
diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
similarity index 96%
rename from tests/models/language/pooling/test_mxbai_rerank.py
rename to tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
index 73823deeff4e0a6017a134eafb85174037932317..05ebb4ec4d3f50efdf652332fdd03a9520a5f4b7 100644
--- a/tests/models/language/pooling/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@@ -6,8 +6,8 @@ import pytest
 import torch
 
 from tests.conftest import HfRunner
+from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
 
-from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
 from .mteb_utils import mteb_test_rerank_models
 
 mxbai_rerank_hf_overrides = {
@@ -20,6 +20,7 @@ RERANK_MODELS = [
     LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
                                architecture="Qwen2ForSequenceClassification",
                                hf_overrides=mxbai_rerank_hf_overrides,
+                               mteb_score=0.273,
                                enable_test=True),
     LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
                                architecture="Qwen2ForSequenceClassification",
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling_mteb_test/test_nomic.py
similarity index 84%
rename from tests/models/language/pooling/test_nomic.py
rename to tests/models/language/pooling_mteb_test/test_nomic.py
index 2d05958e9bcda98bc61db24cfe67efeb9a8403ff..61512fd0dff18f4b035eb2f5f6c76534daf1ffb2 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@@ -3,13 +3,16 @@
 
 import pytest
 
-from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
-from .embed_utils import correctness_test_embed_models
+from tests.models.language.pooling.embed_utils import (
+    correctness_test_embed_models)
+from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
                              architecture="NomicBertModel",
+                             mteb_score=0.737568559,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
                              architecture="NomicBertModel",
@@ -19,6 +22,7 @@ MODELS = [
                              enable_test=False),
     CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
                              architecture="NomicBertModel",
+                             mteb_score=0.715488912,
                              enable_test=True)
 ]
 
diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
similarity index 96%
rename from tests/models/language/pooling/test_qwen3_reranker.py
rename to tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
index 5dd2d9eae911567f4831f48f137e6359aa413f62..65403081dc0f81f9e51f6e0862bbda8a63a632c6 100644
--- a/tests/models/language/pooling/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@@ -6,9 +6,9 @@ import pytest
 import torch
 
 from tests.conftest import HfRunner
+from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
 from tests.utils import multi_gpu_test
 
-from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
 from .mteb_utils import mteb_test_rerank_models
 
 qwen3_reranker_hf_overrides = {
@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
 RERANK_MODELS = [
     LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
                                architecture="Qwen3ForSequenceClassification",
+                               mteb_score=0.25736,
                                hf_overrides=qwen3_reranker_hf_overrides,
                                enable_test=True),
     LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
similarity index 83%
rename from tests/models/language/pooling/test_snowflake_arctic_embed.py
rename to tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
index c22c78592e535b818340b48370dc9e9b33ca1d56..91bad2c4e42fc1dc7483a3dbc77b074224f33acc 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@@ -3,14 +3,17 @@
 
 import pytest
 
-from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
-from .embed_utils import correctness_test_embed_models
+from tests.models.language.pooling.embed_utils import (
+    correctness_test_embed_models)
+from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+
 from .mteb_utils import mteb_test_embed_models
 
 MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
                              is_matryoshka=False,
                              architecture="BertModel",
+                             mteb_score=0.714927797,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
                              is_matryoshka=False,
@@ -23,6 +26,7 @@ MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
                              is_matryoshka=False,
                              architecture="NomicBertModel",
+                             mteb_score=0.681146831,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
                              is_matryoshka=False,
@@ -31,14 +35,17 @@ MODELS = [
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
                              is_matryoshka=True,
                              architecture="BertModel",
+                             mteb_score=0.649088363,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
                              is_matryoshka=True,
                              architecture="XLMRobertaModel",
+                             mteb_score=0.712258299,
                              enable_test=True),
     CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                              is_matryoshka=True,
                              architecture="GteModel",
+                             mteb_score=0.706622444,
                              enable_test=True),
 ]
 
@@ -46,7 +53,7 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_embed_models_mteb(hf_runner, vllm_runner,
                            model_info: EmbedModelInfo) -> None:
-    mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
+    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
 
 @pytest.mark.parametrize("model_info", MODELS)
diff --git a/tests/models/language/pooling/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py
similarity index 60%
rename from tests/models/language/pooling/test_st_projector.py
rename to tests/models/language/pooling_mteb_test/test_st_projector.py
index 51ddbcc5ab249646e34955d710684f17df76dfaa..bd493e7e2ba099d904a486487062cf66b1788e71 100644
--- a/tests/models/language/pooling/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
-from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
+                                LASTPoolingEmbedModelInfo)
+
 from .mteb_utils import mteb_test_embed_models
 
 # ST models with projector (Dense) layers
@@ -10,8 +12,13 @@ ST_PROJECTOR_MODELS = [
     CLSPoolingEmbedModelInfo(
         "TencentBAC/Conan-embedding-v1",
         architecture="BertModel",
+        mteb_score=0.688611955,
         enable_test=True,
     ),
+    LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
+                              architecture="Gemma3TextModel",
+                              mteb_score=0.7473819294684156,
+                              enable_test=True)
 ]
 
 
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index d39cf706786e2e8e56f1ce1a72ca490ee9ab5e20..a4e21aface41ff40e9fd4ecab91bb05f03132bbc 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
 
 IMG_URLS = [
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
+    "237-400x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
+    "231-200x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
+    "27-500x500.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
+    "17-150x600.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
 ]
 PROMPT = "Describe each image in one short sentence."
 
@@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
     return engine_inputs
 
 
-MSGS = [
-    _create_msg_format(IMG_URLS[:1]),
-    _create_msg_format(IMG_URLS[:2]),
-    _create_msg_format(IMG_URLS),
-]
-
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 LIMIT_MM_PER_PROMPT = dict(image=4)
 
@@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_chat(
-    vllm_runner,
-    max_model_len: int,
-    model: str,
-    dtype: str,
-) -> None:
+def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
+              local_asset_server) -> None:
     EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
         FIXTURE_LOGPROBS_CHAT[model])
     with vllm_runner(
@@ -174,7 +164,14 @@ def test_chat(
             limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
         outputs = []
-        for msg in MSGS:
+
+        urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
+        msgs = [
+            _create_msg_format(urls_all[:1]),
+            _create_msg_format(urls_all[:2]),
+            _create_msg_format(urls_all),
+        ]
+        for msg in msgs:
             output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
 
             outputs.extend(output)
@@ -190,17 +187,19 @@ def test_chat(
                          name_1="output")
 
 
-@pytest.mark.parametrize("prompt,expected_ranges",
-                         [(_create_engine_inputs_hf(IMG_URLS[:1]),
-                           [PlaceholderRange(offset=11, length=494)]),
-                          (_create_engine_inputs_hf(IMG_URLS[1:4]), [
-                              PlaceholderRange(offset=11, length=266),
-                              PlaceholderRange(offset=277, length=1056),
-                              PlaceholderRange(offset=1333, length=418)
-                          ])])
-def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt,
+@pytest.mark.parametrize(
+    "image_urls,expected_ranges",
+    [(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
+     (IMG_URLS[1:4], [
+         PlaceholderRange(offset=11, length=266),
+         PlaceholderRange(offset=277, length=1056),
+         PlaceholderRange(offset=1333, length=418)
+     ])])
+def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
                                   expected_ranges: list[PlaceholderRange],
-                                  monkeypatch) -> None:
+                                  local_asset_server, monkeypatch) -> None:
+    local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
+    prompt = _create_engine_inputs_hf(local_image_urls)
 
     # This placeholder checking test only works with V0 engine
     # where `multi_modal_placeholders` is returned with `RequestOutput`
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index c61c27ae204a31fad402f25bcf20ef40d37f2c19..a81f5e7ec88724c41cc8c73c02fe95f753688cf3 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -154,7 +154,7 @@ def batch_make_image_embeddings(
         embed_counter += cur_batch_embed_len
         image_counter += cur_batch_image_count
 
-    # ensure we don't lost any images or embeddings
+    # ensure we don't lose any images or embeddings
     assert embed_counter == image_embeds.size(0)
     assert image_counter == image_grid_thw.size(0)
     assert len(image_batches) == len(result)
@@ -238,7 +238,7 @@ def batch_make_video_embeddings(
         embed_counter += cur_batch_embed_len
         video_counter += cur_batch_video_count
 
-    # ensure we don't lost any videos or embeddings
+    # ensure we don't lose any videos or embeddings
     assert embed_counter == video_embeds.size(0)
     assert video_counter == video_grid_thw.size(0)
     assert len(video_batches) == len(result)
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 4a65e8c95204ea73e639a7e197796b1395c1c2a8..e0e9980b883391e8b674499fffb858d6644cbc36 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -122,8 +122,7 @@ def run_test(
 
 
 @pytest.mark.core_model
-@pytest.mark.parametrize(
-    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
 @create_new_process_for_each_test()
 def test_models(vllm_runner, model) -> None:
     run_test(
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 03c08240d6a81bbdaf3cd7c3d88257c9799e66ba..133d5d6ee2ef86e4a9b2fbc6b1067c7c6763097d 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(
 
 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                              size_type: SizeType):
-    """Applies a size scaler to one image; this can be a an image size factor,
+    """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
     # are considering size factors at constant scale, i.e., we just clone
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index 336e2dd2b1201b4e58dff1173546ea65835da012..1edb512135343c386bdc8dcbf701897c6e9cb212 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
             else:
                 assert test_info.prompt_formatter is not None
 
-            # Everything looks okay; keep if this is has correct proc handling
+            # Everything looks okay; keep if this is correct proc handling
             if (test_info.distributed_executor_backend
                     is not None) == new_proc_per_test:
                 matching_tests[test_name] = test_info
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index a5d6948f06efd5aa80457e869ced699cbe614130..11d44120b875ff83cbdb8dc4d8c0724ce23cd6f8 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -42,7 +42,7 @@ def run_test(
     tensor_parallel_size: int = 1,
     vllm_embeddings: Optional[torch.Tensor] = None,
 ):
-    """Modality agnostic test test executor for comparing HF/vLLM outputs."""
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
     # In the case of embeddings, vLLM takes separate input tensors
     vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
 
@@ -69,6 +69,9 @@ def run_test(
         vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
     if model_info.hf_overrides:
         vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
+    if model_info.skip_tokenizer_init:
+        vllm_runner_kwargs_[
+            "skip_tokenizer_init"] = model_info.skip_tokenizer_init
 
     if vllm_runner_kwargs:
         vllm_runner_kwargs_.update(vllm_runner_kwargs)
diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py
index e9be79fba911fcad5cafa5db936762301ab75320..b503d42567022bb81ec05425958d83e2e24285b4 100644
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -46,7 +46,7 @@ def _run_test(
         vllm_model.encode(prompt)
 
 
-MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
+MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
 
 
 @pytest.mark.core_model
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 3ff4360b83345a86e89d4b77bb3f10cf6e9fe87c..ced0ab3377a9ec1c3b72ac0393f1f27fa01ae9aa 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -66,7 +66,9 @@ def _test_processing_correctness(
         hf_overrides=model_info.hf_overrides,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
 
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -293,6 +295,7 @@ def _test_processing_correctness_one(
     "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
     "OpenGVLab/InternVL3_5-30B-A3B",
     "Kwai-Keye/Keye-VL-8B-Preview",
+    "Kwai-Keye/Keye-VL-1_5-8B",
     "moonshotai/Kimi-VL-A3B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
     "llava-hf/llava-1.5-7b-hf",
@@ -301,6 +304,7 @@ def _test_processing_correctness_one(
     "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
     "meta-llama/Llama-3.2-11B-Vision-Instruct",
     "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mispeech/midashenglm-7b",
     "openbmb/MiniCPM-Llama3-V-2_5",
     "openbmb/MiniCPM-o-2_6",
     "openbmb/MiniCPM-V-2_6",
diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py
index a49842e1099c2f80e0756d7208d907bd17b16225..dfb8d9b2a038db0328a24b33c275083340df47e7 100644
--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -5,6 +5,7 @@ import pytest
 
 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend
 
 from ...utils import build_model_context
 
@@ -50,3 +51,49 @@ def test_processor_override(
 
     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
+
+
+@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("fps", [2])
+def test_video_loader_consistency(
+    model_id: str,
+    fps: int,
+):
+    """
+    Ensure dynamic video loader (pre-sampled by loader) and normal video 
+    loader (post-sampled by processor) produce same video processing outputs.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs = {"fps": fps}
+
+    # Build the image str / prompt based on the number of images we pass
+    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
+    with open(video_path, "rb") as f:
+        video_bytes = f.read()
+
+    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
+    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
+        video_bytes, requested_fps=fps)
+
+    # pre-sampled loader shouldn't read all frames
+    assert len(dynamic_video) < len(static_video)
+
+    static_mm_data = {"video": [(static_video, static_metadata)]}
+    dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
+
+    static_outputs = processor.apply(prompt, static_mm_data,
+                                     hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
+                                      hf_processor_mm_kwargs)
+
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
+        "prompt_token_ids"]
+    assert static_outputs["mm_kwargs"].get_data(
+    ) == dynamic_outputs["mm_kwargs"].get_data()
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index 3be77b5da63f22b33a30ec5aee448202151dc814..e7b28ff8ec7f0047aa25cd5f50613705dc5170bf 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int):
     chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
     num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y seperator tokens
+        1]  # x-y separator tokens
     total_tokens = total_num_patches.item() + num_tiles.item(
     ) + 3  # image start, image, image end
 
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 1a11fa3d2b824d69f8a874ca6df85afaaa86aa55..3b87b669dbbe3c307ecb73a38ae55677b0612629 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,6 +31,7 @@ from ...utils import dummy_hf_overrides
 
 ARCH_TO_SKIP = {
     "MolmoForCausalLM": "incompatible requirements",
+    "Florence2ForConditionalGeneration": "not supported in V1",
 }
 ARCH_NEEDS_EXTRAS = [
     "InternVLChatModel",
@@ -41,9 +42,6 @@ ARCH_NEEDS_EXTRAS = [
 ]
 REPO_ID_TO_SKIP = {
     "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
-    # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
-    # after support PP for GPT-OSS
-    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
 }
 
 ImageInput = list[Image.Image]
@@ -199,7 +197,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
 
diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py
index 7096810d8e15c4311b519887cbb32c60ae88a385..caf1966ab513f26ffccbf3edf6586b2f8a372c66 100644
--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str):
         revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
 
     original_weights = create_repo_dummy_weights(model_id)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 78444c5a62e5e752c4e046be61cf6c0488f5faf0..2c3dec56994064eb045a0b152550266fb728d4ff 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Literal, Optional
 
 import pytest
+import torch
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
-from vllm.config import TokenizerMode
+from vllm.config import ModelDType, TokenizerMode
 
 
 @dataclass(frozen=True)
@@ -47,6 +48,23 @@ class _HfExamplesInfo:
     The reason for the minimum/maximum version requirement.
     """
 
+    skip_tokenizer_init: bool = False
+    """
+    If true, skip initialization of tokenizer and detokenizer. 
+    """
+
+    dtype: ModelDType = "auto"
+    """
+    The data type for the model weights and activations.
+    """
+
+    enforce_eager: bool = False
+    """
+    Whether to enforce eager execution. If True, we will
+    disable CUDA graph and always execute the model in eager mode.
+    If False, we will use CUDA graph and eager execution in hybrid.
+    """
+
     is_available_online: bool = True
     """
     Set this to ``False`` if the name of this architecture no longer exists on
@@ -76,6 +94,15 @@ class _HfExamplesInfo:
     If not specified, the default revision will be used.
     """
 
+    max_num_seqs: Optional[int] = None
+    """Maximum number of sequences to be processed in a single iteration."""
+
+    use_original_num_layers: bool = False
+    """
+    If True, use the original number of layers from the model config 
+    instead of minimal layers for testing.
+    """
+
     def check_transformers_version(
         self,
         *,
@@ -137,7 +164,7 @@ class _HfExamplesInfo:
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
     # [Decoder-only]
-    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
+    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-2509",
                                           min_transformers_version="4.56.0",
                                           trust_remote_code=True),
     "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
@@ -154,7 +181,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
                                          trust_remote_code=True),
     "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                         extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
     "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                         {"1b": "bigscience/bloomz-1b1"}),
@@ -208,7 +235,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
+                                                   min_transformers_version="4.55.3"),
     "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
     "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                              trust_remote_code=True),
@@ -228,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                             trust_remote_code=True),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                         extras={
                                             "tiny": "ai21labs/Jamba-tiny-dev",
                                             "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
@@ -244,7 +272,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
                                          is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
+                                         min_transformers_version="4.55.3",
+                                         extras={
+                                            "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
+                                         }),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
     "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                          trust_remote_code=True),
@@ -259,7 +291,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
     "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1",  # noqa: E501
                                           {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
-    "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"),  # noqa: E501
+    "MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
+                                        trust_remote_code=True,
+                                        v0_only=True),
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
@@ -282,8 +316,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         max_transformers_version="4.53",
-                                         transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings",  # noqa: E501
                                         trust_remote_code=True),
     "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                        max_transformers_version="4.53",
@@ -294,6 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
+    "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                            min_transformers_version="4.56.2"),
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
                                           trust_remote_code=True,
@@ -328,6 +362,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
     # [Text-only]
     "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
     "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),  # noqa: E501
+    "Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
     "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
     "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                                trust_remote_code=True),
@@ -359,7 +394,20 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                          trust_remote_code=True),
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
     "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
-                                            is_available_online=False),  # noqa: E501
+                                            dtype=torch.float16,
+                                            enforce_eager=True,
+                                            skip_tokenizer_init=True,
+                                            # This is to avoid the model
+                                            # going OOM in CI
+                                            max_num_seqs=32,
+                                            ),
+    "Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
+                                  dtype=torch.float16,
+                                  enforce_eager=True,
+                                  skip_tokenizer_init=True,
+                                  # This is to avoid the model going OOM in CI
+                                  max_num_seqs=32,
+                                  ),
 }
 
 _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
@@ -438,6 +486,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),    # noqa: E501
     "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                     trust_remote_code=True),
+    "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
+                                                         trust_remote_code=True),
     "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                       extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
                                                       trust_remote_code=True),
@@ -455,6 +505,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       max_transformers_version="4.48",  # noqa: E501
                                                       transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                                       hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
+    "MiDashengLMModel": _HfExamplesInfo("mispeech/midashenglm-7b",
+                            trust_remote_code=True),
     "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
                                 trust_remote_code=True),
     "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
@@ -474,6 +526,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                               trust_remote_code=True),
     "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
                                                      trust_remote_code=True),
+    "NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy",
+                                          is_available_online=False,
+                                          trust_remote_code=True),
     "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
                             max_transformers_version="4.53",
                             transformers_version_reason="HF model is not compatible",  # noqa: E501
@@ -554,19 +609,21 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
     "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random",
                                         speculative_model="eagle618/eagle-deepseek-v3-random",  # noqa: E501
                                         trust_remote_code=True),
-    "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B",
+    "EagleLlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B-Instruct", # noqa: E501
                                              trust_remote_code=True,
                                              speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
-                                             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"),  # noqa: E501
-    "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",  # noqa: E501
+                                             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
+    "Eagle3LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.1-8B-Instruct",  # noqa: E501
+                                            trust_remote_code=True,
+                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
+                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct",
+                                            use_original_num_layers=True,
+                                            max_model_len=10240),
+    "LlamaForCausalLMEagle3": _HfExamplesInfo("Qwen/Qwen3-8B",  # noqa: E501
                                             trust_remote_code=True,
-                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
-                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
-    # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611   # noqa: E501
-    # "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3",  # noqa: E501
-    #                                         trust_remote_code=True,
-    #                                         speculative_model="AngelSlim/Qwen3-8B_eagle3",   # noqa: E501
-    #                                         tokenizer="Qwen/Qwen3-8B"),
+                                            speculative_model="AngelSlim/Qwen3-8B_eagle3",   # noqa: E501
+                                            tokenizer="Qwen/Qwen3-8B",
+                                            use_original_num_layers=True),
     "EagleLlama4ForCausalLM": _HfExamplesInfo(
         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
         trust_remote_code=True,
@@ -586,7 +643,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                         is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
-                                    speculative_model="XiaomiMiMo/MiMo-7B-RL")
+                                    speculative_model="XiaomiMiMo/MiMo-7B-RL"),
+    "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                     min_transformers_version="4.56.2"),
 }
 
 _TRANSFORMERS_BACKEND_MODELS = {
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index b4d516233b4bfc95e0f8b4678dae454f32a67c57..3b13a12276f5d0f40454db9d6653203f86059efb 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -36,7 +36,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
 
     hf_overrides_fn = partial(dummy_hf_overrides,
                               model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides)
+                              exist_overrides=model_info.hf_overrides,
+                              use_original_num_layers=getattr(
+                                  model_info, 'use_original_num_layers',
+                                  False))
 
     # Avoid calling model.forward()
     def _initialize_kv_caches_v0(self) -> None:
@@ -60,19 +63,29 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                        _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
             m.setenv("VLLM_USE_V1", "0")
-        if model_arch == "Phi4FlashForCausalLM":
-            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
+            # Phi4FlashForCausalLM and MotifForCausalLM
+            # only supports DIFFERENTIAL_FLASH_ATTN backend
             m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
         if model_arch == "GptOssForCausalLM":
             # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
             # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
             # L4 supports FA3.
             m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        if model_arch == "Florence2ForConditionalGeneration":
+            # An encoder-decoder model that's V0-only. Just skip it
+            # since V0 is about to be removed.
+            pytest.skip("Skipping Florence2ForConditionalGeneration")
+        if model_arch == "WhisperForConditionalGeneration":
+            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
             tokenizer_mode=model_info.tokenizer_mode,
             revision=model_info.revision,
+            enforce_eager=model_info.enforce_eager,
+            skip_tokenizer_init=model_info.skip_tokenizer_init,
+            dtype=model_info.dtype,
             speculative_config={
                 "model": model_info.speculative_model,
                 "num_speculative_tokens": 1,
@@ -85,7 +98,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
             model_impl=ModelImpl.TRANSFORMERS
             if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
             hf_overrides=hf_overrides_fn,
-        )
+            max_num_seqs=model_info.max_num_seqs)
 
 
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6d43ca2f7e15f805a130ace2edd11556fee59c6
--- /dev/null
+++ b/tests/models/test_terratorch.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.conftest import VllmRunner
+from vllm.utils import set_default_torch_num_threads
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
+        "mgazz/Prithvi_v2_eo_300_tl_unet_agb"
+    ],
+)
+def test_inference(
+    vllm_runner: type[VllmRunner],
+    model: str,
+) -> None:
+
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+    prompt = dict(prompt_token_ids=[1],
+                  multi_modal_data=dict(pixel_values=pixel_values,
+                                        location_coords=location_coords))
+    with (
+            set_default_torch_num_threads(1),
+            vllm_runner(
+                model,
+                runner="pooling",
+                dtype=torch.float16,
+                enforce_eager=True,
+                skip_tokenizer_init=True,
+                # Limit the maximum number of sequences to avoid the
+                # test going OOM during the warmup run
+                max_num_seqs=32,
+            ) as vllm_model,
+    ):
+
+        vllm_output = vllm_model.llm.encode(prompt)
+        assert torch.equal(
+            torch.isnan(vllm_output[0].outputs.data).any(),
+            torch.tensor(False))
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 0fb1f5b3753b543ed2afe4b8832fec9b971e5d7c..76c6e4823a12c5c93a1de73194f78144887e652b 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -294,6 +294,8 @@ def build_model_context(
         limit_mm_per_prompt=limit_mm_per_prompt,
         mm_processor_cache_gb=mm_processor_cache_gb,
         hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
         **model_config_kwargs,
     )
     return InputContext(model_config)
@@ -345,6 +347,7 @@ class ModelInfo:
     name: str
     architecture: str = ""
     dtype: str = "auto"
+    hf_dtype: str = "float32"
     hf_overrides: Optional[dict[str, Any]] = None
     default_pooling_type: str = ""
     enable_test: bool = True
@@ -352,6 +355,7 @@ class ModelInfo:
 
 @dataclass
 class EmbedModelInfo(ModelInfo):
+    mteb_score: Optional[float] = None
     is_matryoshka: bool = False
     matryoshka_dimensions: Optional[list[int]] = None
 
@@ -368,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
 
 @dataclass
 class RerankModelInfo(ModelInfo):
-    pass
+    mteb_score: Optional[float] = None
 
 
 @dataclass
@@ -381,11 +385,18 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
     default_pooling_type: str = "LAST"
 
 
+@dataclass
+class GenerateModelInfo(ModelInfo):
+    hf_dtype: str = "auto"
+    hf_ppl: Optional[float] = None
+
+
 def dummy_hf_overrides(
     hf_config: PretrainedConfig,
     *,
     model_arch: str = "",
     exist_overrides: Optional[dict[str, Any]] = None,
+    use_original_num_layers: bool = False,
 ) -> PretrainedConfig:
     """
     Dummy HF overrides function used to create dummy model
@@ -402,10 +413,18 @@ def dummy_hf_overrides(
 
     # we use three layers for Gemma-3n to check
     # both normal layer and kv_shared_layer
-    num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration"
-                         else 1)
+    if use_original_num_layers:
+        # Use the original number of layers from the config
+        num_layers = getattr(text_config, 'num_layers', 1)
+        num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
+    else:
+        # Use minimal layers for testing
+        num_layers = 1
+        num_hidden_layers = (3 if model_arch
+                             == "Gemma3nForConditionalGeneration" else 1)
+
     text_config.update({
-        "num_layers": 1,
+        "num_layers": num_layers,
         "num_hidden_layers": num_hidden_layers,
         "num_experts": num_experts,
         "num_experts_per_tok": 2,
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index a028c668c8ab7d2d8c7c6a7be955bbf6ffae017d..e1e8282dd66d4602f57f529011d65c2ca7a5dd16 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -31,11 +31,11 @@ if TYPE_CHECKING:
     from vllm.multimodal.inputs import MultiModalPlaceholderDict
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
 
 TEST_VIDEO_URLS = [
@@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [
 
 
 @pytest.fixture(scope="module")
-def url_images() -> dict[str, Image.Image]:
-    connector = MediaConnector()
+def url_images(local_asset_server) -> dict[str, Image.Image]:
 
     return {
-        image_url: connector.fetch_image(image_url)
-        for image_url in TEST_IMAGE_URLS
+        image_url: local_asset_server.get_image_asset(image_url)
+        for image_url in TEST_IMAGE_ASSETS
     }
 
 
@@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_fetch_image_http(image_url: str):
     connector = MediaConnector()
 
@@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: dict[str, Image.Image],
-                                  image_url: str, suffix: str):
+                                  raw_image_url: str, suffix: str):
     connector = MediaConnector()
-    url_image = url_images[image_url]
+    url_image = url_images[raw_image_url]
 
     try:
         mime_type = Image.MIME[Image.registered_extensions()[suffix]]
@@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_fetch_image_local_files(image_url: str):
     connector = MediaConnector()
 
@@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):
 
 
 @pytest.mark.asyncio
-async def test_fetch_image_local_files_with_space_in_name():
-    image_url = TEST_IMAGE_URLS[0]
+@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
+async def test_fetch_image_local_files_with_space_in_name(image_url: str):
     connector = MediaConnector()
 
     with TemporaryDirectory() as temp_dir:
@@ -205,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
     assert metadata_sync == metadata_async
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("max_duration", [1, 60, 1800])
+@pytest.mark.parametrize("requested_fps", [2, 24])
+async def test_fetch_video_http_with_dynamic_loader(
+        video_url: str, max_duration: int, requested_fps: int,
+        monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+        connector = MediaConnector(
+            media_io_kwargs={
+                "video": {
+                    "max_duration": max_duration,
+                    "requested_fps": requested_fps,
+                }
+            })
+
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(
+            video_url)
+
+        assert np.array_equal(video_sync, video_async)
+        assert metadata_sync == metadata_async
+        assert metadata_sync["video_backend"] == "opencv_dynamic"
+
+
 # Used for `test_argsort_mm_positions`.
 class TestCase(NamedTuple):
     mm_positions: "MultiModalPlaceholderDict"
@@ -458,7 +483,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
     with torch.inference_mode():
         sharded_output = run_dp_sharded_vision_model(image_input, vision_model)
 
-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
     assert get_tensor_model_parallel_world_size() == world_size
 
     # Check that the outputs have the same shape
@@ -636,11 +661,13 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
 
     # Run the model through the sharded function
     with torch.inference_mode():
-        sharded_output = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                           pixel_values,
+                                                           grid_thw_list,
+                                                           rope_type="rope_3d")
         sharded_output = torch.cat(sharded_output, dim=0)
 
-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
     assert get_tensor_model_parallel_world_size() == world_size
 
     # Compare outputs (only on rank 0)
@@ -691,8 +718,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
 
     # Should handle empty input gracefully
     with torch.inference_mode():
-        output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values,
-                                                   grid_thw_list)
+        output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                   pixel_values,
+                                                   grid_thw_list,
+                                                   rope_type="rope_3d")
 
     assert len(output) == 0
 
@@ -745,8 +774,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
 
     # Should handle uneven distribution without errors
     with torch.inference_mode():
-        output_tuple = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
+                                                         pixel_values,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")
 
     # Verify output shape is reasonable
     merge_factor = vision_model.spatial_merge_size**2
diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py
deleted file mode 100644
index 2d6e5f523cb853e30d808d64e0ea0f56462a2eb2..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_activation.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-import torch.nn.functional as F
-
-from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
-@pytest.mark.parametrize("num_tokens,d,dtype", [
-    (7, 512, torch.half),
-    (7, 512, torch.float),
-    (83, 512, torch.half),
-])
-@torch.inference_mode()
-def test_act_and_mul(
-    activation: str,
-    num_tokens: int,
-    d: int,
-    dtype: torch.dtype,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-    x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device)
-    if activation == "silu_and_mul":
-        layer = SiluAndMul()
-        fn = layer.forward_native
-    elif activation == "gelu_fast":
-        layer = FastGELU()
-        fn = F.gelu
-    else:
-        raise NotImplementedError(
-            f"activation {activation} is not implemented.")
-    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
-    out = layer.to(device=device).forward_neuron(x)
-    ref_out = fn(x.cpu())
-    torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)
diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py
deleted file mode 100644
index efec56360c1424a9524cb325677725f4cf970773..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_block_table.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import neuronxcc.nki.language as nl
-import pytest
-import torch
-import torch.nn.functional as F
-from neuronxcc import nki
-
-from vllm.attention.ops.nki_flash_attn import (
-    load_block_tables, transform_block_tables_for_indirect_load)
-
-
-def is_power_of_2(n):
-    return n > 0 and (n & (n - 1) == 0)
-
-
-def nki_load_and_transform_block_tables(
-    block_tables,
-    num_tiles,
-    num_blocks_per_tile,
-    num_head,
-    head_id,
-    block_size_tiling_factor,
-):
-    assert is_power_of_2(
-        num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
-    block_tables_sbuf = load_block_tables(block_tables, num_tiles,
-                                          num_blocks_per_tile)
-
-    # we need to pass an Index as head_id
-    head_id = nl.arange(1)[None, :] + head_id
-
-    block_tables_transposed = transform_block_tables_for_indirect_load(
-        block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
-    B_P_SIZE = 128
-    assert block_tables_transposed.shape[1] == B_P_SIZE
-
-    out = nl.ndarray(
-        block_tables_transposed.shape,
-        dtype=nl.int32,
-        buffer=nl.shared_hbm,
-    )
-    for i in nl.affine_range(block_tables_transposed.shape[0]):
-        nl.store(dst=out[i], value=block_tables_transposed[i])
-    return out
-
-
-def ref_block_tables_transform(
-    block_tables,
-    num_tiles,
-    num_blocks_per_tile,
-    num_head,
-    head_id,
-    block_size_tiling_factor,
-):
-    assert block_tables.numel() == num_tiles * num_blocks_per_tile
-    block_tables = block_tables.view(num_tiles, num_blocks_per_tile)
-    B_F_SIZE = 128
-    num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
-    block_tables = F.pad(
-        block_tables,
-        (0, 0, 0, num_tiles_padded - num_tiles),
-        "constant",
-        0,
-    )
-
-    block_tables = block_tables * num_head + head_id
-    block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1)
-    offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
-    block_tables = block_tables * block_size_tiling_factor + offset
-    block_tables_transposed = block_tables.view(num_tiles_padded, -1).t()
-
-    num_blocks_per_tile = block_tables_transposed.shape[0]
-    assert num_blocks_per_tile % B_F_SIZE == 0
-    return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE,
-                                        B_F_SIZE, num_tiles_padded)
-
-
-@pytest.mark.parametrize(
-    "q_head_per_kv_head,head_id",
-    [
-        (1, 0),
-        (3, 1),
-    ],
-)
-@pytest.mark.parametrize(
-    "num_tiles,num_blocks_per_tile",
-    [
-        (1, 1),
-        (13, 16),
-        (17, 128),
-        (35, 512),
-        (128, 128),
-        (130, 64),
-        (280, 256),
-        (315, 1),
-    ],
-)
-@torch.inference_mode()
-def test_load_and_transform_block_tables(
-    monkeypatch: pytest.MonkeyPatch,
-    num_tiles,
-    num_blocks_per_tile,
-    q_head_per_kv_head,
-    head_id,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-
-    compiler_flags_str = " ".join([
-        "-O1",
-        "--retry_failed_compilation",
-    ])
-    with monkeypatch.context() as m:
-        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
-
-        torch.manual_seed(10000)
-        torch.set_printoptions(sci_mode=False)
-
-        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
-        B_P_SIZE = 128
-        if num_blocks_per_tile < B_P_SIZE:
-            assert B_P_SIZE % num_blocks_per_tile == 0
-            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
-        else:
-            block_size_tiling_factor = 1
-        max_num_blocks = 100000
-        block_tables = torch.randint(
-            0,
-            max_num_blocks,
-            (num_tiles * num_blocks_per_tile, ),
-            dtype=torch.int32,
-        )
-        nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
-            block_tables.to(device=device),
-            num_tiles,
-            num_blocks_per_tile,
-            q_head_per_kv_head,
-            head_id,
-            block_size_tiling_factor,
-        ).cpu()
-        ref_out = ref_block_tables_transform(
-            block_tables,
-            num_tiles,
-            num_blocks_per_tile,
-            q_head_per_kv_head,
-            head_id,
-            block_size_tiling_factor,
-        )
-        assert (nki_out.shape == ref_out.shape
-                ), f"{nki_out.shape=} != {ref_out.shape=}"
-        assert torch.all(nki_out == ref_out)
diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py
deleted file mode 100644
index 670889ad6b58db6ec7324ed7b5718ca0271d4311..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_cache.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.attention.ops.nki_flash_attn import reshape_and_cache
-
-
-@pytest.mark.parametrize(
-    "num_tokens, n_kv_head, d_head, num_blocks, block_size",
-    [
-        # Small model configuration (e.g., GPT-2 small)
-        (32, 12, 64, 4, 128),  # Typical sequence processing
-        (1, 12, 64, 4, 128),  # Single token update
-        (128, 12, 64, 4, 128),  # Longer sequence
-
-        # Medium model configuration (e.g., GPT-2 medium)
-        (64, 16, 96, 8, 256),  # Standard batch
-        (256, 16, 96, 8, 256),  # Large batch
-
-        # Large model configuration (e.g., GPT-3 style)
-        (48, 32, 128, 16, 512),  # Typical processing window
-        (512, 32, 128, 16, 512),  # Full context window
-
-        # Edge cases and stress tests
-        (1024, 8, 32, 32, 32),  # Many tokens, small heads
-        (16, 64, 256, 4, 64),  # Few tokens, many heads
-        (2048, 24, 128, 64, 128),  # Large scale test
-
-        # Minimal configurations for debugging
-        (4, 2, 16, 2, 16),  # Tiny test case
-        (1, 1, 8, 1, 8),  # Minimal possible
-    ])
-def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
-                           block_size):
-    # Set random seed for reproducibility
-    torch.manual_seed(42)
-
-    # Create CPU tensors for reference implementation
-    key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
-        torch.tensor(d_head))
-    value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
-        torch.tensor(d_head))
-    key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
-    value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
-    slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]
-
-    # Run reference implementation on CPU
-    block_indices = torch.div(slot_mapping_cpu,
-                              block_size,
-                              rounding_mode="floor")
-    block_offsets = slot_mapping_cpu % block_size
-
-    for i in range(num_tokens):
-        block_idx = block_indices[i]
-        block_offset = block_offsets[i]
-        key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
-        value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]
-
-    # Create XLA device tensors
-    device = torch.device('xla')
-    key = key_cpu.to(device)
-    value = value_cpu.to(device)
-    key_cache = torch.zeros_like(key_cache_cpu, device=device)
-    value_cache = torch.zeros_like(value_cache_cpu, device=device)
-    slot_mapping = slot_mapping_cpu.to(device)
-    kv_cache = torch.stack([key_cache, value_cache])
-
-    # Run vectorized implementation on XLA device
-    reshape_and_cache(key, value, kv_cache, slot_mapping)
-    key_cache, value_cache = torch.unbind(kv_cache, dim=0)
-
-    # Move results back to CPU for comparison
-    key_cache_result = key_cache.cpu()
-    value_cache_result = value_cache.cpu()
-
-    # Assert results match
-    torch.testing.assert_close(key_cache_result,
-                               key_cache_cpu,
-                               rtol=1e-5,
-                               atol=1e-5)
-    torch.testing.assert_close(value_cache_result,
-                               value_cache_cpu,
-                               rtol=1e-5,
-                               atol=1e-5)
diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py
deleted file mode 100644
index c6fce1d1a06306c672012400c4d9ba20e04821ae..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_layernorm.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
-    (7, 8, False, torch.half),
-    (83, 768, False, torch.half),
-    (83, 768, True, torch.half),
-    (83, 768, True, torch.bfloat16),
-    (83, 768, True, torch.float32),
-])
-@torch.inference_mode()
-def test_rms_norm(
-    num_tokens: int,
-    hidden_size: int,
-    add_residual: bool,
-    dtype: torch.dtype,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-    layer = RMSNorm(hidden_size).to(dtype=dtype)
-    layer.weight.data.normal_(mean=1.0, std=0.1)
-    scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device)
-    x *= scale
-    residual = torch.randn_like(x) * scale if add_residual else None
-
-    residual_cpu = residual.cpu() if add_residual else None
-    ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
-    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
-    out = layer.to(device=device)(x, residual)
-
-    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
-    # numerical errors than other operators because they involve reductions.
-    # Therefore, we use a larger tolerance.
-    if add_residual:
-        assert out[0].is_xla, "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out[0].cpu(),
-                                   ref_out[0],
-                                   atol=1e-2,
-                                   rtol=1e-2)
-        torch.testing.assert_close(out[1].cpu(),
-                                   ref_out[1],
-                                   atol=1e-2,
-                                   rtol=1e-2)
-    else:
-        assert out.is_xla, "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py
deleted file mode 100644
index ce9eadf5a883e25d77e5b5aabc99355a587c84b4..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_logits_processor.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import random
-from unittest.mock import patch
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import is_pin_memory_available
-
-
-class MockLogitsProcessor(LogitsProcessor):
-
-    def __init__(self, vocab_size: int, scale: float,
-                 fake_logits: torch.Tensor):
-        super().__init__(vocab_size=vocab_size, scale=scale)
-        self.fake_logits = fake_logits.clone()
-
-    def forward(self, *args, **kwargs):
-        with patch(
-                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
-                lambda x, y: x
-        ), patch(
-                "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
-                lambda *args, **kwargs: self.fake_logits):
-            return super().forward(*args, **kwargs)
-
-
-def _prepare_test(
-        batch_size: int
-) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
-    vocab_size = 32000
-    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
-    fake_logits = torch.full((batch_size, vocab_size),
-                             1e-2,
-                             dtype=input_tensor.dtype)
-    logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
-    return input_tensor, fake_logits, logits_processor
-
-
-RANDOM_SEEDS = list(range(8))
-
-
-@pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_logits_processors(seed: int):
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    set_random_seed(seed)
-    torch.set_default_device("cpu")
-    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)
-
-    # This sample logits processor gives infinite score to the i-th token,
-    # where i is the length of the input sequence.
-    # We therefore expect the output token sequence to be [0, 1, 2, ...]
-    def pick_ith(token_ids, logits):
-        logits[len(token_ids)] = float("inf")
-        return logits
-
-    seq_group_metadata_list = []
-    seq_lens = []
-    for i in range(batch_size):
-        seq_group_metadata_list.append(
-            SequenceGroupMetadata(
-                request_id=f"test_{i}",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0,
-                                               logits_processors=[pick_ith]),
-                block_tables={0: [1]},
-            ))
-        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
-
-    sampling_metadata = SamplingMetadata.prepare(
-        seq_group_metadata_list,
-        seq_lens,
-        query_lens=seq_lens,
-        device=device,
-        pin_memory=is_pin_memory_available())
-    logits_processor_output = logits_processor(
-        lm_head=None,
-        hidden_states=input_tensor,
-        sampling_metadata=sampling_metadata)
-
-    fake_logits *= logits_processor.scale
-    torch.testing.assert_close(logits_processor_output[:, 1],
-                               fake_logits[:, 1],
-                               rtol=1e-4,
-                               atol=0.0)
diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py
deleted file mode 100644
index 5f3268810f9fe3b8ed97f8d95939c0c68a70b33a..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_neuron_model_runner.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-from unittest.mock import MagicMock
-
-from vllm.config import VllmConfig
-from vllm.engine.arg_utils import EngineArgs
-from vllm.platforms import current_platform
-from vllm.platforms.neuron import NeuronFramework
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SequenceData, SequenceGroupMetadata
-from vllm.worker.neuron_model_runner import NeuronModelRunner
-
-os.environ[
-    'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
-
-
-def _create_neuron_model_runner(model: str, *args,
-                                **kwargs) -> NeuronModelRunner:
-    engine_args = EngineArgs(model, *args, **kwargs)
-    engine_config = engine_args.create_engine_config()
-    vllm_config = VllmConfig(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-    )
-    neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
-    return neuron_model_runner
-
-
-def test_update_neuron_sampling_params_not_full_batch():
-    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
-    model_runner = _create_neuron_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        max_num_seqs=2,
-    )
-    assert not model_runner._on_device_sampling_disabled
-    # Test sampling param updating only when TNx is framework
-    # NxDI handles sampling parameter updating inside model
-    if current_platform.use_transformers_neuronx():
-        model_mock = MagicMock()
-        model_runner.model = model_mock
-
-        seq_group_metadata_list = [
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0.5,
-                                               top_k=1,
-                                               top_p=0.5),
-                block_tables={0: [1]},
-            )
-        ]
-
-        model_runner.prepare_model_input(seq_group_metadata_list)
-
-        # Index neuron sampling parameters based on block_tables indices.
-        # The first block_id of the sequence 0 is 1, so its parameters are
-        # placed at index 1. So the sampling parameters will be:
-        # Index 0: default sampling parameters
-        # Index 1: sequecne 0's sampling parameters.
-        neuron_sampling_params = (
-            model_runner.model_config.neuron_sampling_params)
-        assert neuron_sampling_params.temperature == [1.0, 0.5]
-        assert neuron_sampling_params.top_k == [
-            model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
-        ]
-        assert neuron_sampling_params.top_p == [1.0, 0.5]
-        model_mock.model.update_generation_config.assert_called_once_with(
-            neuron_sampling_params)
-
-
-def test_update_neuron_sampling_params_full_batch():
-    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
-    model_runner = _create_neuron_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        max_num_seqs=2,
-    )
-    assert not model_runner._on_device_sampling_disabled
-
-    # Test sampling param updating only when TNx is framework
-    # NxDI handles sampling parameter updating inside model
-    if current_platform.use_transformers_neuronx():
-        model_mock = MagicMock()
-        model_runner.model = model_mock
-
-        seq_group_metadata_list = [
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0.5,
-                                               top_k=1,
-                                               top_p=0.5),
-                block_tables={0: [1]},
-            ),
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
-                sampling_params=SamplingParams(temperature=0.2,
-                                               top_k=2,
-                                               top_p=0.2),
-                block_tables={1: [0]},
-            )
-        ]
-
-        model_runner.prepare_model_input(seq_group_metadata_list)
-
-        # Index neuron sampling parameters based on block_tables indices.
-        # The first block_id of the sequence 0 is 1, so its parameters are
-        # placed at index 1. So the sampling parameters will be:
-        # Index 0: sequence 1's sampling parameters
-        # Index 1: sequecne 0's sampling parameters.
-        neuron_sampling_params = (
-            model_runner.model_config.neuron_sampling_params)
-        assert neuron_sampling_params.temperature == [0.2, 0.5]
-        assert neuron_sampling_params.top_k == [2, 1]
-        assert neuron_sampling_params.top_p == [0.2, 0.5]
-        model_mock.model.update_generation_config.assert_called_once_with(
-            neuron_sampling_params)
diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py
deleted file mode 100644
index 08630026959281a0cc5e390fe096f0e4c42a2bb8..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_neuron_quant.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.model_executor.layers.quantization.neuron_quant import (
-    NeuronQuantConfig)
-
-
-def test_get_supported_act_dtypes():
-    neuron_quant_config = NeuronQuantConfig()
-    supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
-    target_list = ["any_dtype1", "any_dtype2"]
-    for dtype in target_list:
-        assert dtype in supported_act_dtypes
diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py
deleted file mode 100644
index abf7febc2955c42988bf82c27255d03c606c2c5a..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ /dev/null
@@ -1,514 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Optional
-
-import pytest
-import torch
-import torch.nn.functional as F
-
-from vllm.utils import cdiv
-
-
-class BlockDiagonalCausalFromBottomRightMask:
-
-    @staticmethod
-    def _from_seqlens(query_lens, seq_lens, block_size=None):
-        from torch import logical_and, logical_or
-
-        contexted = block_size is None
-        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        n_queries = sum(query_lens)
-        num_seqs = len(query_lens)
-        if contexted:
-            key_lens_blockaligned = seq_lens
-        else:
-            n_blocks_per_seq = (context_lens + block_size - 1) // block_size
-            offset_per_seq = n_blocks_per_seq * block_size
-            key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
-        n_keys = sum(key_lens_blockaligned)
-
-        a = (torch.arange(n_queries).reshape(n_queries,
-                                             1).expand(n_queries, n_keys))
-        b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
-        q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
-        k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
-
-        prior_mask = torch.zeros(n_queries, n_keys)
-        new_masks: list[torch.Tensor] = []
-        for seq_id in range(num_seqs):
-            ri = q_cumsum[seq_id]
-            ci = k_cumsum[seq_id]
-            nr = query_lens[seq_id]
-
-            if contexted:
-                nc = seq_lens[seq_id]
-                a_offset = ci + nc - ri - nr
-                new_mask = (a + a_offset) >= b
-            else:
-                nc = context_lens[seq_id]
-                a_offset = ci + nc - 1
-                new_mask = a_offset >= b
-
-            left_mask = b >= ci
-            top_mask = a >= ri
-            bottom_mask = a < (ri + nr)
-
-            new_mask = logical_and(
-                logical_and(logical_and(new_mask, left_mask), top_mask),
-                bottom_mask,
-            )
-            prior_mask = logical_or(prior_mask, new_mask)
-            new_masks = new_masks + [new_mask]
-        return prior_mask
-
-    @staticmethod
-    def from_seqlens(query_lens, seq_lens, block_size=None):
-        contexted = block_size is None
-        if contexted:
-            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, seq_lens)
-            active_mask = None
-        else:
-            prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, seq_lens, block_size)
-            active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
-                query_lens, query_lens)
-        return prior_mask, active_mask
-
-
-def ref_softmax(x: torch.Tensor,
-                dim: int,
-                mixed_precision=False,
-                return_max_reduce=False):
-    max_value = torch.amax(x, dim=dim, keepdims=True)
-    exp = torch.exp(x - max_value)
-    if mixed_precision:
-        sum_value = torch.sum(exp.astype(torch.float32),
-                              dim=dim,
-                              keepdims=True).astype(x.dtype)
-    else:
-        sum_value = torch.sum(exp, dim=dim, keepdims=True)
-    if return_max_reduce:
-        return exp / sum_value, max_value, torch.reciprocal(sum_value)
-    return exp / sum_value
-
-
-def ref_masked_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    scale: float,
-    attn_mask: Optional[torch.Tensor] = None,
-    return_max_reduce: Optional[bool] = False,
-) -> torch.Tensor:
-    scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
-    if attn_mask is not None:
-        masked_score = scaled_qk + attn_mask.float()
-    if return_max_reduce:
-        norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
-            masked_score, dim=-1, return_max_reduce=True)
-    else:
-        norm_score = ref_softmax(masked_score, dim=-1)
-    out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
-    if return_max_reduce:
-        return (
-            out,
-            cached_max,
-            cached_sum_reciprocal,
-            norm_score,
-            masked_score,
-            scaled_qk,
-        )
-    else:
-        return (out, )
-
-
-def ref_context_attention(
-    query,
-    key,
-    value,
-    query_lens,
-    seq_lens,
-    head_size,
-    num_queries_per_kv,
-    return_max_reduce=False,
-):
-    scale = float(1.0 / (head_size**0.5))
-    if num_queries_per_kv > 1:
-        # Handle MQA and GQA
-        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
-        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
-
-    attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-        query_lens, seq_lens)
-
-    # convert binary mask to -inf values
-    attn_mask = torch.logical_not(attn_mask)
-    attn_mask = attn_mask.float() * -30000
-
-    output, *debug_tensors = ref_masked_attention(
-        query,
-        key,
-        value,
-        scale,
-        attn_mask,
-        return_max_reduce=return_max_reduce,
-    )
-
-    output = output.unsqueeze(1)
-    if return_max_reduce:
-        cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
-            debug_tensors)
-        return (
-            output,
-            cached_max,
-            cached_sum_reciprocal,
-            lse,
-            masked_score,
-            scaled_qk,
-        )
-    else:
-        return output
-
-
-def sample_inputs(
-    prefill_batch_size,
-    decode_batch_size,
-    min_query_len,
-    max_query_len,
-    min_ctx_len,
-    max_ctx_len,
-    block_size,
-    num_heads,
-    num_kv_heads,
-    head_size,
-    dtype,
-):
-    batch_size = prefill_batch_size + decode_batch_size
-    max_model_len = (max_query_len + max_ctx_len) * 4
-    max_block_per_request = max_model_len // block_size
-    cache_size = (batch_size * max_block_per_request) + 2
-    prefill_ctx_lens = torch.randint(min_ctx_len,
-                                     max_ctx_len + 1, (prefill_batch_size, ),
-                                     dtype=torch.long).tolist()
-    decode_ctx_lens = torch.randint(min_ctx_len,
-                                    max_ctx_len + 1, (decode_batch_size, ),
-                                    dtype=torch.long).tolist()
-    ctx_lens = prefill_ctx_lens + decode_ctx_lens
-    query_lens = torch.randint(
-        min_query_len,
-        max_query_len + 1,
-        (prefill_batch_size, ),
-        dtype=torch.long,
-    ).tolist() + [1 for _ in range(decode_batch_size)]
-    seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
-
-    num_tokens = sum(query_lens)
-    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
-    query.uniform_(-1, 1)
-    torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
-
-    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
-    kv.uniform_(-1, 1)
-    key, value = kv.unbind(dim=1)
-
-    k_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=dtype)
-    v_cache = torch.zeros(cache_size,
-                          block_size,
-                          num_kv_heads,
-                          head_size,
-                          dtype=dtype)
-    k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
-    values = torch.arange(0, cache_size, dtype=torch.long)
-    values = values[torch.randperm(cache_size)]
-    block_table = values[:batch_size * max_block_per_request].view(
-        batch_size, max_block_per_request)
-    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
-                                            dtype=torch.long),
-                               dim=0)
-    # copy kv to cache
-    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
-                                                dtype=torch.long),
-                                   dim=0)
-    for i in range(batch_size):
-        for j in range(query_lens[i]):
-            k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
-                                            j])
-            v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
-                                              b_ctx_len[i] + j])
-        cur_ctx = 0
-        block_id = 0
-        while cur_ctx < b_ctx_len[i]:
-            start_loc = b_seq_start_loc[i] + cur_ctx
-            if cur_ctx + block_size > b_ctx_len[i]:
-                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
-            else:
-                end_loc = start_loc + block_size
-            start_slot = block_table[i, block_id] * block_size
-            end_slot = start_slot + end_loc - start_loc
-            k_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             key[start_loc:end_loc])
-            v_cache.view(-1, num_kv_heads,
-                         head_size)[start_slot:end_slot].copy_(
-                             value[start_loc:end_loc])
-            cur_ctx += block_size
-            block_id += 1
-    kv_cache = torch.stack([k_cache, v_cache])
-
-    return (
-        query,
-        k,
-        v,
-        kv_cache,
-        block_table,
-        key,
-        value,
-        query_lens,
-        seq_lens,
-    )
-
-
-def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
-                            num_blocks):
-    context_lens = seq_lens - query_lens
-    blocks_per_seq = (context_lens + block_size - 1) // block_size
-    num_seqs = len(seq_lens)
-    active_blocks: list[int] = []
-    for seq_id in range(num_seqs):
-        active_blocks = (
-            active_blocks +
-            block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
-    return F.pad(
-        torch.tensor(active_blocks, dtype=torch.int32),
-        (0, num_blocks - len(active_blocks)),
-        "constant",
-        0,
-    )
-
-
-@pytest.mark.parametrize(
-    "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
-    [
-        # Test minimal configurations (small block size)
-        (1, 199, 1, 512, 4, 2, 8, False
-         ),  # minimal block size, small dimensions
-        (1, 199, 1, 512, 4, 2, 8, True),  # same with mixed precision
-
-        # Test common/medium configurations
-        (4, 12, 32, 2048, 32, 8, 64, False),  # common case, larger heads
-        (4, 12, 32, 2048, 16, 4, 32,
-         True),  # medium size, mixed precision, grouped-query attention (GQA)
-
-        # Test large configurations
-        (4, 12, 256, 8192, 8, 1, 128, False),  # large blocks, large head size
-        (4, 12, 256, 8192, 64, 8, 64, True),  # large blocks, many heads
-
-        # Test asymmetric configurations
-        (2, 24, 64, 4096, 12, 4, 96, False),  # varied batch sizes
-        (8, 8, 128, 2048, 24, 2, 48, True),  # balanced batches
-
-        # Test edge cases
-        (1, 128, 16, 1024, 4, 2, 16, False),  # large decode batch
-        (16, 4, 8, 1024, 4, 2, 128, True),  # large prefill batch
-        (4, 12, 32, 2048, 16, 1, 32, True),  # multi-head attention (MHA)
-        (4, 12, 32, 2048, 16, 16, 32, True),  # multi-query attention (MQA)
-    ])
-@torch.inference_mode()
-def test_contexted_kv_attention(
-    monkeypatch: pytest.MonkeyPatch,
-    prefill_batch_size: int,
-    decode_batch_size: int,
-    num_heads: int,
-    num_queries_per_kv: int,
-    head_size: int,
-    block_size: int,
-    large_tile_size,
-    mixed_precision: bool,
-) -> None:
-
-    import torch_xla.core.xla_model as xm
-
-    from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
-                                                   reorder_context_mask)
-
-    assert large_tile_size % block_size == 0
-
-    device = xm.xla_device()
-
-    compiler_flags_str = " ".join([
-        "-O1",
-        "--retry_failed_compilation",
-    ])
-    with monkeypatch.context() as m:
-        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
-
-        torch.manual_seed(0)
-        torch.set_printoptions(sci_mode=False)
-        torch.set_default_device("cpu")
-        dtype = torch.float32
-
-        min_ctx_len = 32
-        max_ctx_len = 1024
-        min_query_len = 16
-        max_query_len = 512
-        num_kv_heads = num_heads // num_queries_per_kv
-        (
-            query,
-            k_active,
-            v_active,
-            kv_cache,
-            block_table,
-            key,
-            value,
-            query_lens,
-            seq_lens,
-        ) = sample_inputs(
-            prefill_batch_size=prefill_batch_size,
-            decode_batch_size=decode_batch_size,
-            min_query_len=min_query_len,
-            max_query_len=max_query_len,
-            min_ctx_len=min_ctx_len,
-            max_ctx_len=max_ctx_len,
-            block_size=block_size,
-            num_heads=num_heads,
-            num_kv_heads=num_kv_heads,
-            head_size=head_size,
-            dtype=dtype,
-        )
-
-        output_ref = ref_context_attention(
-            query,
-            key,
-            value,
-            query_lens,
-            seq_lens,
-            head_size,
-            num_queries_per_kv,
-            return_max_reduce=False,
-        )
-
-        # build neuron program
-        B_P_SIZE = 128
-        assert (large_tile_size >= B_P_SIZE
-                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
-
-        def pad_to_multiple(a, b):
-            return cdiv(a, b) * b
-
-        def pad_to_next_power_of_2(a):
-            assert a > 0
-            return 2**int(a - 1).bit_length()
-
-        # calculate input shapes
-        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
-        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        num_active_blocks = cdiv(context_lens, block_size).sum().item()
-        num_active_blocks = pad_to_multiple(num_active_blocks,
-                                            large_tile_size // block_size)
-        context_kv_len = num_active_blocks * block_size
-        assert (
-            context_kv_len %
-            large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
-
-        # pad QKV tensors
-        pad_dims = (
-            0,
-            0,
-            0,
-            0,
-            0,
-            max_num_queries - query.shape[0],
-        )
-        query = F.pad(query, pad_dims, "constant", 0)
-        k = F.pad(k_active, pad_dims, "constant", 0)
-        v = F.pad(v_active, pad_dims, "constant", 0)
-
-        # permute QKV tensors
-        # query: (1, n_heads, d, seq_q)
-        # key:   (1, n_kv_heads, d, seq_k)
-        # value: (1, n_kv_heads, seq_v, d)
-        query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-        k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
-        v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
-        kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
-
-        # transform block table
-        active_block_table = get_active_block_tables(
-            block_table.cpu(),
-            torch.tensor(query_lens).cpu(),
-            torch.tensor(seq_lens).cpu(),
-            block_size,
-            num_active_blocks,
-        )
-
-        # Build attention masks
-        prior_mask, active_mask = (
-            BlockDiagonalCausalFromBottomRightMask.from_seqlens(
-                query_lens, seq_lens, block_size=block_size))
-        prior_mask_padded = F.pad(
-            prior_mask,
-            (
-                0,
-                context_kv_len - prior_mask.shape[1],
-                0,
-                max_num_queries - prior_mask.shape[0],
-            ),
-            "constant",
-            0,
-        ).bool()
-        active_mask_padded = F.pad(
-            active_mask,
-            (
-                0,
-                max_num_queries - active_mask.shape[1],
-                0,
-                max_num_queries - active_mask.shape[0],
-            ),
-            "constant",
-            0,
-        ).bool()
-        attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
-                                 dim=1)
-
-        attn_mask = reorder_context_mask(attn_mask, large_tile_size,
-                                         block_size)
-
-        input_args = (
-            query.to(device=device),
-            k.to(device=device),
-            v.to(device=device),
-            kv_cache.to(device=device),
-            active_block_table.to(device=device),
-            attn_mask.to(device=device),
-        )
-        input_kwargs = dict(
-            n_kv_head=num_kv_heads,
-            head_size=head_size,
-            mixed_precision=mixed_precision,
-            LARGE_TILE_SZ=large_tile_size,
-        )
-
-        output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
-
-        num_actual_tokens = sum(query_lens)
-        # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
-        output_nki = output_nki.cpu().permute(0, 2, 1, 3)
-        output_nki = output_nki[0, :num_actual_tokens, :, :]
-        output_ref_padded = F.pad(
-            output_ref,
-            (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
-            "constant",
-            0,
-        )
-        output_ref = output_ref_padded.transpose(
-            0, 1)[0, :num_actual_tokens, :, :]
-
-        torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py
deleted file mode 100644
index a7ac79729986d748d9c03ef3a296554376a72c2c..0000000000000000000000000000000000000000
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for miscellaneous utilities
-"""
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize(
-    "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
-        (16, False, 32, 32, 1024, True),
-        (16, False, 32, 128, 1024, True),
-        (16, True, 32, 32, 1024, True),
-        (16, True, 32, 128, 1024, True),
-        (16, False, 32, 128, 1024, False),
-        (16, True, 32, 128, 1024, False),
-    ])
-def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
-                                  head_size, seq_len, use_key):
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-
-    batch_size = 1
-    base = 10000
-    num_heads = 8
-
-    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                          is_neox_style, torch.float32)
-
-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device="cpu")
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=torch.float32,
-                        device="cpu")
-    key = torch.randn_like(query) if use_key else None
-    assert positions.is_cpu, \
-        "reference input tensor is expected to be CPU tensor."
-    ref_query, ref_key = rot.to(device="cpu").forward_native(
-        positions, query, key)
-    out_query, out_key = rot.to(device=device).forward_neuron(
-        positions.to(device=device), query.to(device=device),
-        key.to(device=device) if key is not None else None)
-    if use_key:
-        assert out_query.is_xla and out_key.is_xla, \
-            "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out_key.cpu(),
-                                   ref_key,
-                                   atol=1e-2,
-                                   rtol=1e-2)
-    else:
-        assert out_key is None, "expected returned key to be None"
-        assert out_query.is_xla, \
-            "output tensor is expected to be XLA tensor"
-    torch.testing.assert_close(out_query.cpu(),
-                               ref_query,
-                               atol=1e-2,
-                               rtol=1e-2)
diff --git a/tests/neuron/2_core/test_comm_ops.py b/tests/neuron/2_core/test_comm_ops.py
deleted file mode 100644
index 85a48dae58aaf37b443dc64b37dd02dcb0b6d1f8..0000000000000000000000000000000000000000
--- a/tests/neuron/2_core/test_comm_ops.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
-from typing import Callable
-from unittest.mock import patch
-
-import pytest
-import torch
-import torch_xla.distributed.xla_multiprocessing as xmp
-from typing_extensions import ParamSpec
-
-from vllm.distributed.communication_op import (
-    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
-from vllm.utils import get_distributed_init_method, get_open_port
-
-_P = ParamSpec("_P")
-
-
-def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
-    """Decorator to reinitialize the Neuron Runtime before executing a test.
-    This is necessary for distributed tests which need to reallocate Neuron
-    Cores to separate subprocesses.
-    """
-
-    @functools.wraps(f)
-    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
-        runtime = torch.classes.neuron.Runtime()
-        runtime.initialize()
-        runtime.unsafe_close()
-
-        f(*args, **kwargs)
-        runtime.initialize()
-
-    return wrapper
-
-
-def all_gather_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-
-    num_dimensions = 3
-    tensor_size = list(range(2, num_dimensions + 2))
-    total_size = 1
-    for s in tensor_size:
-        total_size *= s
-
-    all_gather_dimension = -1
-    all_tensors = [
-        torch.arange(total_size, dtype=torch.float32,
-                     device="xla").reshape(tensor_size) * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.cat(all_tensors, dim=all_gather_dimension)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-    torch.testing.assert_close(t, expected)
-
-
-def all_reduce_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-
-    num_elements = 8
-    all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_reduce(t)
-    torch.testing.assert_close(t, expected)
-
-
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("test_target",
-                         [all_reduce_test_worker, all_gather_test_worker])
-@reinitialize_neuron_runtime
-def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
-                                              test_target):
-
-    with patch('torch_xla._XLAC._xla_runtime_is_initialized',
-               return_value=False):
-        distributed_init_method = get_distributed_init_method(
-            "127.0.0.1", get_open_port())
-
-        monkeypatch.setenv("VLLM_USE_V1", "1")
-        monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
-        monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
-                           ','.join(['1' for _ in range(tp_size)]))
-
-        xmp.spawn(test_target, args=(tp_size, distributed_init_method))
diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
deleted file mode 100644
index cac642af03101ab5d484625b1adf8c66d073fab1..0000000000000000000000000000000000000000
--- a/tests/neuron/2_core/test_eagle.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import json
-import os
-import shutil
-import tempfile
-
-import torch
-from huggingface_hub import snapshot_download
-from safetensors import safe_open
-
-from vllm import LLM, SamplingParams
-
-
-def patch_eagle_draft_with_lm_head(target_model_id: str,
-                                   draft_model_id: str) -> str:
-    # In NxDI, draft model checkpoint must include lm_head weights from target
-    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
-    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
-    # #eagle-checkpoint-compatibility
-    final_draft_dir = "/tmp/patched_eagle_draft"
-
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        target_dir = snapshot_download(repo_id=target_model_id,
-                                       local_dir=os.path.join(
-                                           tmp_dir, "target"))
-        draft_dir = snapshot_download(repo_id=draft_model_id,
-                                      local_dir=os.path.join(tmp_dir, "draft"))
-
-        lm_head_key = "lm_head.weight"
-        index_path = os.path.join(target_dir, "model.safetensors.index.json")
-        with open(index_path) as f:
-            index = json.load(f)
-        shard_name = index["weight_map"][lm_head_key]
-        target_safetensor_path = os.path.join(target_dir, shard_name)
-
-        with safe_open(target_safetensor_path, framework="pt") as f:
-            target_lm_head = f.get_tensor(lm_head_key)
-
-        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
-        draft_state_dict = torch.load(draft_path, map_location="cpu")
-        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
-        torch.save(draft_state_dict, draft_path)
-
-        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
-
-    return final_draft_dir
-
-
-def test_eagle():
-    patched_draft_path = patch_eagle_draft_with_lm_head(
-        target_model_id="meta-llama/Llama-2-7b-hf",
-        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
-    llm = LLM(
-        model="meta-llama/Llama-2-7b-hf",
-        speculative_config={
-            "model": patched_draft_path,
-            "num_speculative_tokens": 5,
-            "max_model_len": 128
-        },
-        max_num_seqs=1,
-        max_model_len=128,
-        tensor_parallel_size=2,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-            "fused_qkv": True
-        },
-    )
-    prompts = [
-        "The president of the United States is",
-    ]
-    outputs = llm.generate(prompts, SamplingParams(top_k=1))
-    expected_output = " the head of state and head of government of " \
-    "the United States. The president direct"
-
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-
-    print("Neuron Eagle speculation test passed.")
diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py
deleted file mode 100644
index ff59be1725b6c30775636541a51f1e45a7834a59..0000000000000000000000000000000000000000
--- a/tests/neuron/2_core/test_mistral.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, SamplingParams
-
-
-def test_mistral():
-    llm = LLM(model="mistralai/Mistral-7B-v0.1",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=128,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True
-              })
-
-    # Send more prompts than the compiled batch size (4) and request
-    # varying generation lengths to test accuracy related to Neuron
-    # specific sequence id sorting.
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-        "What is Annapurna labs?",
-        "I believe the meaning of life is",
-        "Tell me a story about a brave knight",
-        "Hello, my name is Llama",
-    ]
-
-    sampling_params = [
-        SamplingParams(top_k=1, max_tokens=10),
-        SamplingParams(top_k=1, max_tokens=20),
-        SamplingParams(top_k=1, max_tokens=30),
-        SamplingParams(top_k=1, max_tokens=40),
-        SamplingParams(top_k=1, max_tokens=50),
-        SamplingParams(top_k=1, max_tokens=60)
-    ]
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    expected_outputs = [
-        " the most powerful person in the world. He is",
-        " a city of many faces. It is a city of history, culture, art, "
-        "fashion, and",
-        "\n\nAnnapurna Labs is a semiconductor company that was founded "
-        "in 2013 by Amazon. The company is",
-        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
-        "believe that happiness is a state of mind.\n\nI believe that "
-        "happiness is a journey.\n\nI believe",
-        " who rescued a princess from a dragon.\n\nTell me a story about"
-        " a princess who rescued herself from a dragon.\n\nTell me a "
-        "story about a princess who rescued herself from a dragon and "
-        "then rescued a knight from",
-        " and I am a 10 year old male. I am a very friendly and "
-        "affectionate boy who loves to be around people. I am a very "
-        "active boy who loves to play and run around. I am a very smart "
-        "boy who loves to learn new things. I am a very loyal boy"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-
-    print("Neuron Mistral test passed.")
diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py
deleted file mode 100644
index 52ca9fe7b6667542076c2d954a59a8dadb3024e0..0000000000000000000000000000000000000000
--- a/tests/neuron/2_core/test_multi_lora.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from huggingface_hub import snapshot_download
-
-from vllm import LLM, SamplingParams
-from vllm.lora.request import LoRARequest
-
-
-def test_llama_single_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=1,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_1])
-
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
-
-
-def test_llama_multiple_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled":
-                  False,
-                  "skip_warmup":
-                  True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }, {
-                      "name": "lora_id_2",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=2,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    lora_req_2 = LoRARequest("lora_id_2", 1, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_2])
-
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
index a750c756c11a26f43799db3066efdfeda0b95d1a..4bbb79c98a82a04e5d8da50b6215e7dace5e37d5 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-def register_prithvi_india():
-    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia"  # noqa: E501
 
 
-def register_prithvi_valencia():
-    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia"  # noqa: E501
+def register_prithvi():
+    return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor"  # noqa: E501
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index d49a50b7a309f33d7b8940010e027793e1d94460..42874f0398f0a547b58bd2bc082747688a89770b 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -8,7 +8,7 @@ import datetime
 import os
 import tempfile
 import urllib.request
-from collections.abc import AsyncGenerator, Sequence
+from collections.abc import Sequence
 from typing import Any, Optional, Union
 
 import albumentations
@@ -234,6 +234,8 @@ def load_image(
 
 class PrithviMultimodalDataProcessor(IOProcessor):
 
+    indices = [0, 1, 2, 3, 4, 5]
+
     def __init__(self, vllm_config: VllmConfig):
 
         super().__init__(vllm_config)
@@ -359,14 +361,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
 
         return prompts
 
-    async def pre_process_async(
-        self,
-        prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
-        **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
-        return self.pre_process(prompt, request_id, **kwargs)
-
     def post_process(
         self,
         model_output: Sequence[PoolingRequestOutput],
@@ -420,30 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor):
                                   format="tiff",
                                   data=out_data,
                                   request_id=request_id)
-
-    async def post_process_async(
-        self,
-        model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
-        **kwargs,
-    ) -> IOProcessorOutput:
-        collected_output = [item async for i, item in model_output]
-        return self.post_process(collected_output, request_id, **kwargs)
-
-
-class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
-
-    def __init__(self, vllm_config: VllmConfig):
-
-        super().__init__(vllm_config)
-
-        self.indices = [1, 2, 3, 8, 11, 12]
-
-
-class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
-
-    def __init__(self, vllm_config: VllmConfig):
-
-        super().__init__(vllm_config)
-
-        self.indices = [0, 1, 2, 3, 4, 5]
diff --git a/tests/plugins/prithvi_io_processor_plugin/setup.py b/tests/plugins/prithvi_io_processor_plugin/setup.py
index a03b1fbbd4a809821c8443f601a5b774461ee0ae..3ddda1a47bbe431a4f88bc27ba0b60f29d22c082 100644
--- a/tests/plugins/prithvi_io_processor_plugin/setup.py
+++ b/tests/plugins/prithvi_io_processor_plugin/setup.py
@@ -9,8 +9,7 @@ setup(
     packages=["prithvi_io_processor"],
     entry_points={
         "vllm.io_processor_plugins": [
-            "prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india",  # noqa: E501
-            "prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia",  # noqa: E501
+            "prithvi_to_tiff = prithvi_io_processor:register_prithvi",  # noqa: E501
         ]
     },
 )
diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py
index 00fe429445d7dc8550dcfce76fa833ff71d90e1e..3567a701a3afa1e78807598dcc5929de591a3b2d 100644
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -7,12 +7,11 @@ import requests
 
 from tests.utils import RemoteOpenAIServer
 from vllm.config import VllmConfig
-from vllm.entrypoints.llm import LLM
 from vllm.entrypoints.openai.protocol import IOProcessorResponse
 from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
 
-MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 
 image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
 
@@ -23,61 +22,7 @@ def test_loading_missing_plugin():
         get_io_processor(vllm_config, "wrong_plugin")
 
 
-def test_loading_engine_with_wrong_plugin():
-
-    with pytest.raises(ValueError):
-        LLM(
-            model=MODEL_NAME,
-            skip_tokenizer_init=True,
-            trust_remote_code=True,
-            enforce_eager=True,
-            # Limit the maximum number of parallel requests
-            # to avoid the model going OOM in CI.
-            max_num_seqs=32,
-            io_processor_plugin="wrong_plugin",
-        )
-
-
-@pytest.mark.parametrize("model_name", [MODEL_NAME])
-def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
-
-    img_prompt = dict(
-        data=image_url,
-        data_format="url",
-        image_format="tiff",
-        out_data_format="b64_json",
-    )
-
-    pooling_params = PoolingParams(task="encode", softmax=False)
-
-    with vllm_runner(
-            model_name,
-            runner="pooling",
-            skip_tokenizer_init=True,
-            trust_remote_code=True,
-            enforce_eager=True,
-            # Limit the maximum number of parallel requests
-            # to avoid the model going OOM in CI.
-            max_num_seqs=1,
-            io_processor_plugin="prithvi_to_tiff_valencia",
-    ) as llm_runner:
-        pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
-        )
-    output = pooler_output[0].outputs
-
-    # verify the output is formatted as expected for this plugin
-    assert all(
-        hasattr(output, attr)
-        for attr in ["type", "format", "data", "request_id"])
-
-    # We just check that the output is a valid base64 string.
-    # Raises an exception and fails the test if the string is corrupted.
-    base64.b64decode(output.data)
-
-
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def server():
     args = [
         "--runner",
@@ -90,7 +35,9 @@ def server():
         "--max-num-seqs",
         "32",
         "--io-processor-plugin",
-        "prithvi_to_tiff_valencia"
+        "prithvi_to_tiff",
+        "--model-impl",
+        "terratorch",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -113,6 +60,7 @@ async def test_prithvi_mae_plugin_online(
         },
         "priority": 0,
         "model": model_name,
+        "softmax": False
     }
 
     ret = requests.post(
@@ -135,3 +83,43 @@ async def test_prithvi_mae_plugin_online(
     # We just check that the output is a valid base64 string.
     # Raises an exception and fails the test if the string is corrupted.
     base64.b64decode(plugin_data["data"])
+
+
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
+
+    img_prompt = dict(
+        data=image_url,
+        data_format="url",
+        image_format="tiff",
+        out_data_format="b64_json",
+    )
+
+    pooling_params = PoolingParams(task="encode", softmax=False)
+
+    with vllm_runner(
+            model_name,
+            runner="pooling",
+            skip_tokenizer_init=True,
+            trust_remote_code=True,
+            enforce_eager=True,
+            # Limit the maximum number of parallel requests
+            # to avoid the model going OOM in CI.
+            max_num_seqs=1,
+            model_impl="terratorch",
+            io_processor_plugin="prithvi_to_tiff",
+    ) as llm_runner:
+        pooler_output = llm_runner.get_llm().encode(
+            img_prompt,
+            pooling_params=pooling_params,
+        )
+    output = pooler_output[0].outputs
+
+    # verify the output is formatted as expected for this plugin
+    assert all(
+        hasattr(output, attr)
+        for attr in ["type", "format", "data", "request_id"])
+
+    # We just check that the output is a valid base64 string.
+    # Raises an exception and fails the test if the string is corrupted.
+    base64.b64decode(output.data)
diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index fcbfa681d75c9917ec01335b46221bdd61bed705..c60a03f44baec3abc830b52d58812b461a3ff1d3 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch):
                     reason="ModelOpt FP8 is not supported on this GPU type.")
 def test_modelopt_fp8_checkpoint_setup(vllm_runner):
     """Test ModelOpt FP8 checkpoint loading and structure validation."""
-    # TODO: provide a small publically available test checkpoint
+    # TODO: provide a small publicly available test checkpoint
     model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
                   "TinyLlama-1.1B-Chat-v1.0-fp8-0710")
 
diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py
index eef3568efea12b0f6bdfecabb0b28f6ed1ad8cb1..8e68f6a2e019fd30395e27ce2c9ddc17b640ed86 100644
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
         print(output)
 
 
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.skip(
+    reason="since torchao nightly is only compatible with torch nightly"
+    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
+    "torchao tests that requires newer versions (0.14.0.dev+) for now")
+def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
+    torch._dynamo.reset()
+    model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
+                  "-0.14.0.dev")
+    with vllm_runner(model_name=model_name,
+                     quantization="torchao",
+                     dtype="bfloat16",
+                     pt_load_map_location="cuda:0") as llm:
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=32)
+
+        assert output
+        print(output)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
index 84c615b6b8dbc539c0f8fc09c6a7dbb847a539ef..22bdb3b44eb03fbc802d6404128f4725a6fc574f 100644
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm import SamplingParams
-from vllm.config import LoadConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader import get_model_loader
 
 load_format = "runai_streamer"
diff --git a/tests/runai_model_streamer_test/test_runai_utils.py b/tests/runai_model_streamer_test/test_runai_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bde77ff6650636ba5c94f4448f5c7debae53340f
--- /dev/null
+++ b/tests/runai_model_streamer_test/test_runai_utils.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import glob
+import os
+import tempfile
+
+import huggingface_hub.constants
+
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf)
+from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
+                                                 list_safetensors)
+
+
+def test_is_runai_obj_uri():
+    assert is_runai_obj_uri("gs://some-gcs-bucket/path")
+    assert is_runai_obj_uri("s3://some-s3-bucket/path")
+    assert not is_runai_obj_uri("nfs://some-nfs-path")
+
+
+def test_runai_list_safetensors_local():
+    with tempfile.TemporaryDirectory() as tmpdir:
+        huggingface_hub.constants.HF_HUB_OFFLINE = False
+        download_weights_from_hf("openai-community/gpt2",
+                                 allow_patterns=["*.safetensors", "*.json"],
+                                 cache_dir=tmpdir)
+        safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
+        assert len(safetensors) > 0
+        parentdir = [
+            os.path.dirname(safetensor) for safetensor in safetensors
+        ][0]
+        files = list_safetensors(parentdir)
+        assert len(safetensors) == len(files)
+
+
+if __name__ == "__main__":
+    test_is_runai_obj_uri()
+    test_runai_list_safetensors_local()
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index cc9a88a255f9f429e54759482448ea69e12a01a2..0320a5ef31a6585b1a08f069d1ea0c30f5837c75 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit(
     beam_width: int,
 ) -> None:
     # example_prompts[1]&[3]&[7] fails due to unknown reason even without
-    # concurency limit. skip them for now.
+    # concurrency limit. skip them for now.
     example_prompts = (example_prompts[:8])
     concurrency_limit = 2
     assert len(example_prompts) > concurrency_limit
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 0fb142a1b6e560cc29b2158454a3caa3e5f4207d..e00d7c2f80c67b4babfe279d3a385acabebd5f3b 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
         model = vllm_runner(
             model_ref,
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+        pytest.fail("Expected RuntimeError for extra config keys")
     except RuntimeError:
         out, err = capfd.readouterr()
         combined_output = out + err
-        assert ("ValueError: Model loader extra config "
-                "is not supported for load "
+        assert ("ValueError: Unexpected extra config keys for load "
                 "format auto") in combined_output
     finally:
         del model
@@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd,
             model_ref,
             load_format="safetensors",
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
+        pytest.fail("Expected RuntimeError for extra config keys")
     except RuntimeError:
         out, err = capfd.readouterr()
 
         combined_output = out + err
-        assert ("ValueError: Model loader extra config is not supported "
+        assert ("ValueError: Unexpected extra config keys "
                 "for load format safetensors") in combined_output
     finally:
         del model
diff --git a/tests/test_config.py b/tests/test_config.py
index 957771a4226bcaedf13f2df5e8130b1c38e138fa..373fbd267539a80fae84e8655a7d424cca9fdf5b 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
 import pytest
 
 from vllm.compilation.backends import VllmBackend
-from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
-                         get_field, update_config)
+from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
+                         update_config)
+from vllm.config.load import LoadConfig
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 
diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..0192c7d2765cdd9f6839b114da604352bad6217f
--- /dev/null
+++ b/tests/tool_use/test_openai_tool_parser.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+from openai_harmony import (Conversation, DeveloperContent,
+                            HarmonyEncodingName, Message, Role, SystemContent,
+                            load_harmony_encoding)
+
+from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+MODEL = "gpt2"
+
+
+@pytest.fixture(scope="module")
+def openai_tokenizer():
+    # The parser does not use the tokenizer, but the constructor requires it.
+    return get_tokenizer(MODEL)
+
+
+@pytest.fixture
+def openai_tool_parser(openai_tokenizer):
+    return OpenAIToolParser(openai_tokenizer)
+
+
+@pytest.fixture(scope="module")
+def harmony_encoding():
+    return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall],
+    expected_tool_calls: list[ToolCall],
+):
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+
+    for actual_tool_call, expected_tool_call in zip(actual_tool_calls,
+                                                    expected_tool_calls):
+        assert isinstance(actual_tool_call.id, str)
+        assert len(actual_tool_call.id) > 16  # Default from protocol.py
+        assert actual_tool_call.type == "function"
+        assert actual_tool_call.function == expected_tool_call.function
+
+
+def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
+    convo = Conversation.from_messages([
+        Message.from_role_and_content(
+            Role.SYSTEM,
+            SystemContent.new(),
+        ),
+        Message.from_role_and_content(
+            Role.DEVELOPER,
+            DeveloperContent.new().with_instructions("Talk like a pirate!")),
+        Message.from_role_and_content(Role.USER, "Arrr, how be you?"),
+        Message.from_role_and_content(Role.ASSISTANT,
+                                      "This is a test").with_channel("final")
+    ])
+    token_ids = harmony_encoding.render_conversation_for_completion(
+        convo, Role.ASSISTANT)
+    extracted_info = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=token_ids,
+    )
+    assert not extracted_info.tools_called
+    assert extracted_info.tool_calls == []
+    assert extracted_info.content == "This is a test"
+
+
+def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding):
+    convo = Conversation.from_messages([
+        Message.from_role_and_content(Role.USER,
+                                      "What is the weather in Tokyo?"),
+        Message.from_role_and_content(
+            Role.ASSISTANT,
+            'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.',  #  noqa: E501
+        ).with_channel("analysis"),
+        Message.from_role_and_content(
+            Role.ASSISTANT,
+            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
+                "functions.get_current_weather").with_content_type("json"),
+    ])
+    token_ids = harmony_encoding.render_conversation_for_completion(
+        convo, Role.ASSISTANT)
+
+    extracted_info = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=token_ids,
+    )
+    assert extracted_info.tools_called
+    expected_tool_calls = [
+        ToolCall(function=FunctionCall(
+            name="get_current_weather",
+            arguments=json.dumps({"location": "Tokyo"}),
+        ))
+    ]
+    assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
+    assert extracted_info.content is None
+
+
+def test_extract_tool_calls_multiple_tools(
+    openai_tool_parser,
+    harmony_encoding,
+):
+    convo = Conversation.from_messages([
+        Message.from_role_and_content(
+            Role.USER, "What is the weather in Tokyo based on where I'm at?"),
+        Message.from_role_and_content(
+            Role.ASSISTANT,
+            'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  #  noqa: E501
+        ).with_channel("analysis"),
+        Message.from_role_and_content(
+            Role.ASSISTANT,
+            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
+                "functions.get_current_weather").with_content_type("json"),
+        Message.from_role_and_content(
+            Role.ASSISTANT,
+            '{"location": "Tokyo"}').with_channel("commentary").with_recipient(
+                "functions.get_user_location").with_content_type("json"),
+    ])
+    token_ids = harmony_encoding.render_conversation_for_completion(
+        convo,
+        Role.ASSISTANT,
+    )
+
+    extracted_info = openai_tool_parser.extract_tool_calls(
+        "",
+        request=None,
+        token_ids=token_ids,
+    )
+    assert extracted_info.tools_called
+    expected_tool_calls = [
+        ToolCall(function=FunctionCall(
+            name="get_current_weather",
+            arguments=json.dumps({"location": "Tokyo"}),
+        )),
+        ToolCall(function=FunctionCall(
+            name="get_user_location",
+            arguments=json.dumps({"location": "Tokyo"}),
+        ))
+    ]
+    assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
+    assert extracted_info.content is None
diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py
index 6cefbae4bdd18ba46e73038b311e2bdf68d4569f..8d9fbd280317ca34a21c198a21aee67c52e747b0 100644
--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
         expected_value=0.76),  # no bias
     # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
     # so only one of these tests can run in a single call to pytest. As
-    # a follow up, move this into the LM-EVAL section of the CI.
+    # a follow-up, move this into the LM-EVAL section of the CI.
     # GSM8KAccuracyTestConfig(
     #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
     #     expected_value=0.66),  # bias in QKV layers
diff --git a/tests/transformers_utils/__init__.py b/tests/transformers_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/transformers_utils/test_config_parser_registry.py b/tests/transformers_utils/test_config_parser_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..13c654e05d2aca67ba0e90c540011afc10ba42e7
--- /dev/null
+++ b/tests/transformers_utils/test_config_parser_registry.py
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+from typing import Optional, Union
+
+import pytest
+from transformers import PretrainedConfig
+
+from vllm.transformers_utils.config import (get_config_parser,
+                                            register_config_parser)
+from vllm.transformers_utils.config_parser_base import ConfigParserBase
+
+
+@register_config_parser("custom_config_parser")
+class CustomConfigParser(ConfigParserBase):
+
+    def parse(self,
+              model: Union[str, Path],
+              trust_remote_code: bool,
+              revision: Optional[str] = None,
+              code_revision: Optional[str] = None,
+              **kwargs) -> tuple[dict, PretrainedConfig]:
+        raise NotImplementedError
+
+
+def test_register_config_parser():
+    assert isinstance(get_config_parser("custom_config_parser"),
+                      CustomConfigParser)
+
+
+def test_invalid_config_parser():
+    with pytest.raises(ValueError):
+
+        @register_config_parser("invalid_config_parser")
+        class InvalidConfigParser:
+            pass
diff --git a/tests/utils.py b/tests/utils.py
index 9d2073f3c10360812ebb0b086f5fc4e37fa4409d..16e1e6039329017e9ca7b2d9a3421b06f1105f1e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import contextlib
 import copy
 import functools
 import importlib
@@ -13,10 +14,11 @@ import sys
 import tempfile
 import time
 import warnings
-from contextlib import contextmanager, suppress
+from contextlib import ExitStack, contextmanager, suppress
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Callable, Literal, Optional, Union
+from unittest.mock import patch
 
 import cloudpickle
 import httpx
@@ -799,43 +801,106 @@ _P = ParamSpec("_P")
 
 
 def fork_new_process_for_each_test(
-        f: Callable[_P, None]) -> Callable[_P, None]:
+        func: Callable[_P, None]) -> Callable[_P, None]:
     """Decorator to fork a new process for each test function.
     See https://github.com/vllm-project/vllm/issues/7053 for more details.
     """
 
-    @functools.wraps(f)
+    @functools.wraps(func)
     def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
         # Make the process the leader of its own process group
         # to avoid sending SIGTERM to the parent process
         os.setpgrp()
         from _pytest.outcomes import Skipped
-        pid = os.fork()
-        print(f"Fork a new process to run a test {pid}")
-        if pid == 0:
-            try:
-                f(*args, **kwargs)
-            except Skipped as e:
-                # convert Skipped to exit code 0
-                print(str(e))
-                os._exit(0)
-            except Exception:
-                import traceback
-                traceback.print_exc()
-                os._exit(1)
+
+        # Create a unique temporary file to store exception info from child
+        # process. Use test function name and process ID to avoid collisions.
+        with tempfile.NamedTemporaryFile(
+                delete=False,
+                mode='w+b',
+                prefix=f"vllm_test_{func.__name__}_{os.getpid()}_",
+                suffix=".exc") as exc_file, ExitStack() as delete_after:
+            exc_file_path = exc_file.name
+            delete_after.callback(os.remove, exc_file_path)
+
+            pid = os.fork()
+            print(f"Fork a new process to run a test {pid}")
+            if pid == 0:
+                # Parent process responsible for deleting, don't delete
+                # in child.
+                delete_after.pop_all()
+                try:
+                    func(*args, **kwargs)
+                except Skipped as e:
+                    # convert Skipped to exit code 0
+                    print(str(e))
+                    os._exit(0)
+                except Exception as e:
+                    import traceback
+                    tb_string = traceback.format_exc()
+
+                    # Try to serialize the exception object first
+                    exc_to_serialize: dict[str, Any]
+                    try:
+                        # First, try to pickle the actual exception with
+                        # its traceback.
+                        exc_to_serialize = {'pickled_exception': e}
+                        # Test if it can be pickled
+                        cloudpickle.dumps(exc_to_serialize)
+                    except (Exception, KeyboardInterrupt):
+                        # Fall back to string-based approach.
+                        exc_to_serialize = {
+                            'exception_type': type(e).__name__,
+                            'exception_msg': str(e),
+                            'traceback': tb_string,
+                        }
+                    try:
+                        with open(exc_file_path, 'wb') as f:
+                            cloudpickle.dump(exc_to_serialize, f)
+                    except Exception:
+                        # Fallback: just print the traceback.
+                        print(tb_string)
+                    os._exit(1)
+                else:
+                    os._exit(0)
             else:
-                os._exit(0)
-        else:
-            pgid = os.getpgid(pid)
-            _pid, _exitcode = os.waitpid(pid, 0)
-            # ignore SIGTERM signal itself
-            old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
-            # kill all child processes
-            os.killpg(pgid, signal.SIGTERM)
-            # restore the signal handler
-            signal.signal(signal.SIGTERM, old_signal_handler)
-            assert _exitcode == 0, (f"function {f} failed when called with"
-                                    f" args {args} and kwargs {kwargs}")
+                pgid = os.getpgid(pid)
+                _pid, _exitcode = os.waitpid(pid, 0)
+                # ignore SIGTERM signal itself
+                old_signal_handler = signal.signal(signal.SIGTERM,
+                                                   signal.SIG_IGN)
+                # kill all child processes
+                os.killpg(pgid, signal.SIGTERM)
+                # restore the signal handler
+                signal.signal(signal.SIGTERM, old_signal_handler)
+                if _exitcode != 0:
+                    # Try to read the exception from the child process
+                    exc_info = {}
+                    if os.path.exists(exc_file_path):
+                        with contextlib.suppress(Exception), \
+                            open(exc_file_path, 'rb') as f:
+                            exc_info = cloudpickle.load(f)
+
+                    if (original_exception :=
+                            exc_info.get('pickled_exception')) is not None:
+                        # Re-raise the actual exception object if it was
+                        # successfully pickled.
+                        assert isinstance(original_exception, Exception)
+                        raise original_exception
+
+                    if (original_tb := exc_info.get("traceback")) is not None:
+                        # Use string-based traceback for fallback case
+                        raise AssertionError(
+                            f"Test {func.__name__} failed when called with"
+                            f" args {args} and kwargs {kwargs}"
+                            f" (exit code: {_exitcode}):\n{original_tb}"
+                        ) from None
+
+                    # Fallback to the original generic error
+                    raise AssertionError(
+                        f"function {func.__name__} failed when called with"
+                        f" args {args} and kwargs {kwargs}"
+                        f" (exit code: {_exitcode})") from None
 
     return wrapper
 
@@ -1077,3 +1142,11 @@ def get_attn_backend_list_based_on_platform() -> list[str]:
         return attn_backend_list
     else:
         raise ValueError("Unsupported platform")
+
+
+@contextmanager
+def override_cutlass_fp8_supported(value: bool):
+    with patch(
+            "vllm.model_executor.layers.quantization.utils.w8a8_utils.cutlass_fp8_supported",
+            return_value=value):
+        yield
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 66124dd854ee03852ec2afcc6817c8ee8b12649d..6dbba18b4dcfa9c00db3a8f65358b21518824f27 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -835,22 +835,20 @@ def test_model_specification(parser_with_config, cli_config_file,
 
 @pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
                                    (None, bool, [1, 2, 3])])
-@pytest.mark.parametrize("output", [0, 1, 2])
-def test_sha256(input: tuple, output: int):
-    hash = sha256(input)
-    assert hash is not None
-    assert isinstance(hash, int)
-    assert hash != 0
+def test_sha256(input: tuple):
+    digest = sha256(input)
+    assert digest is not None
+    assert isinstance(digest, bytes)
+    assert digest != b""
 
-    bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(),
-                                  byteorder="big")
+    input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
+    assert digest == hashlib.sha256(input_bytes).digest()
 
     # hashing again, returns the same value
-    assert hash == sha256(input)
+    assert digest == sha256(input)
 
     # hashing different input, returns different value
-    assert hash != sha256(input + (1, ))
+    assert digest != sha256(input + (1, ))
 
 
 @pytest.mark.parametrize(
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index e4c07aae0ebedf1a1bf35392d3d7b5a7ea6effd9..1ae8b91c347a271925d6f214786e8c5ac71947ef 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -70,22 +70,6 @@ BATCH_SPECS = {
 }
 
 
-def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
-    """Create a dummy KV cache tensor for testing."""
-    kv_cache = torch.randn(
-        2,  # K and V
-        num_blocks,
-        kv_cache_spec.block_size,
-        kv_cache_spec.num_kv_heads,
-        kv_cache_spec.head_size,
-        dtype=_convert_dtype_to_torch(kv_cache_spec.dtype),
-        device=device,
-    )
-    return kv_cache
-
-
 def create_and_prepopulate_kv_cache(
         k_contexts: list[torch.Tensor],
         v_contexts: list[torch.Tensor],
diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py
index 8c5a63653db9fbcc6bb471ff0b9d232ca90212da..be77256a0d2f01547e87ab0723cf417b0f92cfc7 100644
--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -160,7 +160,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
         # Use torch.arange instead of torch.randint so we can assert on
         # block table tensor values. The block table will have shape
         # (num_batches, cdiv(max_seq_len, block_size)) and the values will be
-        # aranged from 0 to cdiv(max_seq_len, block_size)-1
+        # arranged from 0 to cdiv(max_seq_len, block_size)-1
         arange_block_indices=True,
     )
 
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index 24070358799ef75eb3eb9e13abd8389638efab40..a62993950affe505446b8f3dce7ac4b1fcf00d37 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -15,7 +15,7 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 BACKENDS_TO_TEST = [
-    _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1,
+    _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA,
     _Backend.TRITON_MLA_VLLM_V1
 ]
 
@@ -69,25 +69,10 @@ BATCH_SPECS = {
 }
 
 
-def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec,
-                          device: torch.device,
-                          num_blocks: int = 100) -> torch.Tensor:
-    """Create a dummy KV cache tensor for testing."""
-    kv_cache = torch.randn(
-        num_blocks,
-        kv_cache_spec.block_size,
-        kv_cache_spec.head_size,  # latent dimension
-        dtype=_convert_dtype_to_torch(kv_cache_spec.dtype),
-        device=device,
-    )
-    return kv_cache
-
-
 def create_and_prepopulate_kv_cache(
         kv_c_contexts: list[torch.Tensor],
         k_pe_contexts: list[torch.Tensor],
         block_size: int,
-        num_kv_heads: int,
         head_size: int,
         dtype: torch.dtype,
         device: torch.device,
@@ -101,7 +86,6 @@ def create_and_prepopulate_kv_cache(
         k_pe_contexts: List of key positional embedding context tensors
                        for each sequence
         block_size: Size of each block
-        num_kv_heads: Number of KV heads (should be 1 for MLA)
         head_size: Size of each head (latent dimension)
         dtype: Data type for the cache
         device: Device to create the cache on
@@ -299,8 +283,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     query_lens = batch_spec.query_lens
     num_q_heads = vllm_config.model_config.get_num_attention_heads(
         vllm_config.parallel_config)
-    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
-        vllm_config.parallel_config)
     head_size = vllm_config.model_config.get_head_size()
     dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
     block_size = vllm_config.cache_config.block_size
@@ -315,7 +297,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
 
     # 2. Generate data and compute SDPA reference output for MLA
     all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], []
-    all_sdpa_outputs = []
+    all_sdpa_outputs: list[list[torch.Tensor]] = []
     kv_c_contexts, k_pe_contexts = [], []
 
     # Create shared MLA weight matrices for consistency across all sequences
@@ -331,6 +313,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
                        device=device)
     kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1)
 
+    for i, backend in enumerate(BACKENDS_TO_TEST):
+        all_sdpa_outputs.append([])
+
     for i in range(batch_size):
         s_len = seq_lens[i]
         q_len = query_lens[i]
@@ -358,85 +343,93 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
                                 dtype=dtype,
                                 device=device)
 
-        # Determine if this is decode (single token)
-        # or prefill (multiple tokens)
-        is_decode = q_len == 1
+        # Determine if this is decode or prefill
+        is_decode = []
+        for i, backend in enumerate(BACKENDS_TO_TEST):
+            builder_cls, _ = get_attention_backend(backend)
+            is_decode.append(q_len <= builder_cls.reorder_batch_threshold)
 
         # Split q into nope and rope components
         q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
 
-        if is_decode:
-            # Decode path: MQA-style attention in latent space
-            # Transform q_nope to latent space: q_nope @ W_UK
-            # q_nope: [1, num_heads, qk_nope_head_dim]
-            # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim]
-            ql_nope = torch.einsum("qnh,lnh->qnl", q_nope,
-                                   W_UK)  # [1, num_heads, kv_lora_rank]
-
-            # Build MQA attention inputs
-            # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim]
-            q_mqa = torch.cat([ql_nope, q_pe], dim=-1)
-            # K: [s_len, kv_lora_rank + qk_rope_head_dim]
-            # (broadcasted to all heads)
-            k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1)
-            k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1)
-            # V: [s_len, kv_lora_rank] (broadcasted to all heads)
-            v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1)
-
-            # SDPA expects (N, H, L, D)
-            q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2)
-            k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2)
-            v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
-
-            sdpa_out_i = torch.nn.functional.scaled_dot_product_attention(
-                q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale)
-            sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(
-                0)  # [1, num_heads, kv_lora_rank]
-
-            # Project back to output space: sdpa_out @ W_UV
-            sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV)
-            sdpa_out_i = sdpa_out_i.flatten(start_dim=-2)
-        else:
-            # Prefill path: MHA-style attention with full sequence
-            # Apply kv_b_proj to the full kv_c tensor
-            kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full,
-                                        kv_b_proj_weight)
-            k_nope_full, v_full = kv_nope_full.split(
-                [qk_nope_head_dim, v_head_dim], dim=-1)
-
-            # Build attention inputs for full sequence
-            q_mha = torch.cat([q_nope, q_pe],
-                              dim=-1)  # [q_len, num_heads, total_dim]
-            k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1)
-            k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1)
-
-            # Create custom attention mask:
-            # - Query tokens can attend to all context tokens
-            # - Query tokens can only attend to query tokens up to their pos
-            attn_mask = torch.ones(q_len,
-                                   s_len,
-                                   dtype=torch.bool,
-                                   device=device)
-            # Apply causal mask only to the query portion (context_len onwards)
-            causal_mask = torch.tril(torch.ones(q_len, q_len, device=device))
-            attn_mask[:, context_len:] = causal_mask
-
-            # SDPA expects (N, H, L, D)
-            q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2)
-            k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2)
-            v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2)
-
-            # Single attention call with custom mask
-            sdpa_out_i = torch.nn.functional.scaled_dot_product_attention(
-                q_sdpa_in,
-                k_sdpa_in,
-                v_sdpa_in,
-                attn_mask=attn_mask,
-                scale=scale)
-            sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0)
-            sdpa_out_i = sdpa_out_i.flatten(start_dim=-2)
-
-        all_sdpa_outputs.append(sdpa_out_i)
+        #######################################################
+        # Decode path: MQA-style attention in latent space
+        # Transform q_nope to latent space: q_nope @ W_UK
+        # q_nope: [1, num_heads, qk_nope_head_dim]
+        # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim]
+        ql_nope = torch.einsum("qnh,lnh->qnl", q_nope,
+                               W_UK)  # [1, num_heads, kv_lora_rank]
+
+        # Build MQA attention inputs
+        # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim]
+        q_mqa = torch.cat([ql_nope, q_pe], dim=-1)
+        # K: [s_len, kv_lora_rank + qk_rope_head_dim]
+        # (broadcasted to all heads)
+        k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1)
+        k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1)
+        # V: [s_len, kv_lora_rank] (broadcasted to all heads)
+        v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1)
+
+        # Create custom attention mask for decode path:
+        # - Query tokens can attend to all context tokens
+        # - Query tokens can only attend to query tokens up to their position
+        attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device)
+        # Apply causal mask only to the query portion (context_len onwards)
+        causal_mask = torch.tril(torch.ones(q_len, q_len, device=device))
+        attn_mask[:, context_len:] = causal_mask
+
+        # SDPA expects (N, H, L, D)
+        q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2)
+        k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2)
+        v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2)
+
+        sdpa_out_i_decode = torch.nn.functional.scaled_dot_product_attention(
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+        sdpa_out_i_decode = sdpa_out_i_decode.transpose(1, 2).squeeze(
+            0)  # [1, num_heads, kv_lora_rank]
+
+        # Project back to output space: sdpa_out @ W_UV
+        sdpa_out_i_decode = torch.einsum("qnl,lnv->qnv", sdpa_out_i_decode,
+                                         W_UV)
+        sdpa_out_i_decode = sdpa_out_i_decode.flatten(start_dim=-2)
+
+        #######################################################
+        # Prefill path: MHA-style attention with full sequence
+        # Apply kv_b_proj to the full kv_c tensor
+        kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, kv_b_proj_weight)
+        k_nope_full, v_full = kv_nope_full.split(
+            [qk_nope_head_dim, v_head_dim], dim=-1)
+
+        # Build attention inputs for full sequence
+        q_mha = torch.cat([q_nope, q_pe],
+                          dim=-1)  # [q_len, num_heads, total_dim]
+        k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1)
+        k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1)
+
+        # Create custom attention mask:
+        # - Query tokens can attend to all context tokens
+        # - Query tokens can only attend to query tokens up to their pos
+        attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device)
+        # Apply causal mask only to the query portion (context_len onwards)
+        causal_mask = torch.tril(torch.ones(q_len, q_len, device=device))
+        attn_mask[:, context_len:] = causal_mask
+
+        # SDPA expects (N, H, L, D)
+        q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2)
+        k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2)
+        v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2)
+
+        # Single attention call with custom mask
+        sdpa_out_i_prefill = torch.nn.functional.scaled_dot_product_attention(
+            q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale)
+        sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0)
+        sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2)
+
+        for i, backend in enumerate(BACKENDS_TO_TEST):
+            if is_decode[i]:
+                all_sdpa_outputs[i].append(sdpa_out_i_decode)
+            else:
+                all_sdpa_outputs[i].append(sdpa_out_i_prefill)
 
         # Inputs for vLLM MLA backends are just the new tokens
         all_q_vllm.append(q_c)
@@ -451,7 +444,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     query_vllm = torch.cat(all_q_vllm, dim=0)
     kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0)
     k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0)
-    sdpa_output = torch.cat(all_sdpa_outputs, dim=0)
+    sdpa_outputs = []
+    for i, backend in enumerate(BACKENDS_TO_TEST):
+        sdpa_outputs.append(torch.cat(all_sdpa_outputs[i], dim=0))
 
     # Create mock kv_b_proj using the same weights as reference implementation
     from vllm.model_executor.layers.linear import ColumnParallelLinear
@@ -477,7 +472,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         kv_c_contexts=kv_c_contexts,
         k_pe_contexts=k_pe_contexts,
         block_size=block_size,
-        num_kv_heads=num_kv_heads,
         head_size=head_size,
         dtype=dtype,
         device=device,
@@ -486,7 +480,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         randomize_blocks=True)
 
     # 4. Run vLLM backends and compare
-    for backend_name in BACKENDS_TO_TEST:
+    for i, backend_name in enumerate(BACKENDS_TO_TEST):
         backend_output = run_attention_backend(
             backend_name, kv_cache_spec, ["placeholder"], vllm_config, device,
             common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache,
@@ -494,12 +488,12 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
             mock_kv_b_proj)
 
         # Check shape and dtype consistency
-        assert backend_output.shape == sdpa_output.shape, (
+        assert backend_output.shape == sdpa_outputs[i].shape, (
             f"[{backend_name}] shape {backend_output.shape} != "
-            f"SDPA shape {sdpa_output.shape}")
-        assert backend_output.dtype == sdpa_output.dtype, (
+            f"SDPA shape {sdpa_outputs[i].shape}")
+        assert backend_output.dtype == sdpa_outputs[i].dtype, (
             f"[{backend_name}] dtype {backend_output.dtype} != "
-            f"SDPA dtype {sdpa_output.dtype}")
+            f"SDPA dtype {sdpa_outputs[i].dtype}")
 
         assert torch.isfinite(backend_output).all(), (
             f"[{backend_name}] produced non-finite values")
@@ -508,12 +502,13 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
         rtol = 1e-2
         atol = 5e-1
 
-        max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item()
+        max_diff = torch.max(torch.abs(backend_output -
+                                       sdpa_outputs[i])).item()
         max_rel_diff = torch.max(
-            torch.abs(backend_output - sdpa_output) /
-            torch.abs(sdpa_output)).item()
+            torch.abs(backend_output - sdpa_outputs[i]) /
+            torch.abs(sdpa_outputs[i])).item()
         all_close = torch.allclose(backend_output,
-                                   sdpa_output,
+                                   sdpa_outputs[i],
                                    rtol=rtol,
                                    atol=atol)
 
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 6a08cdc56f7367323995fe483a51fc9e4fc1e91c..5c49566240df4a2ebe1818be2d90b5824f55eb2c 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -139,6 +139,8 @@ def get_attention_backend(backend_name: _Backend):
         "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
         _Backend.FLASHMLA_VLLM_V1:
         "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
+        _Backend.FLASH_ATTN_MLA:
+        "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
         _Backend.TRITON_MLA_VLLM_V1:
         "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
     }
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index e738f2bd464726a5a7193fdeeb80890be22d0fc1..44e479098ad5d479179d45bc24fbe62b906569f9 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -6,20 +6,22 @@ from typing import Callable, Optional
 import pytest
 import torch
 
+import vllm.v1.core.kv_cache_utils as kv_cache_utils
 from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit
+from vllm.utils import GiB_bytes, sha256, sha256_cbor
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 # disable yapf here as it formats differently than isort such that both fail
 # yapf: disable
 from vllm.v1.core.kv_cache_utils import (
-    FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
+    BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics,
     estimate_max_model_len, generate_block_hash_extra_keys,
     get_kv_cache_config, get_max_concurrency_for_kv_cache_config,
     get_request_block_hasher, hash_block_tokens, init_none_hash,
-    is_kv_cache_type_uniform, unify_kv_cache_configs)
+    is_kv_cache_type_uniform, make_block_hash_with_group_id,
+    unify_kv_cache_configs)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheTensor,
                                         SlidingWindowSpec)
@@ -88,7 +90,7 @@ def new_sliding_window_spec(block_size=16,
                              sliding_window=sliding_window)
 
 
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_none_hash(monkeypatch, hash_fn):
     import vllm.v1.core.kv_cache_utils
 
@@ -98,8 +100,8 @@ def test_none_hash(monkeypatch, hash_fn):
         reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
         reloaded_kv_cache_utils.init_none_hash(hash_fn)
         assert reloaded_kv_cache_utils.NONE_HASH is not None
-        assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int)
-        assert reloaded_kv_cache_utils.NONE_HASH != 0
+        assert isinstance(reloaded_kv_cache_utils.NONE_HASH, bytes)
+        assert reloaded_kv_cache_utils.NONE_HASH != b""
 
     # case 2: PYTHONHASHSEED is set, use the seed and hash_fn
     with monkeypatch.context() as m:
@@ -107,12 +109,11 @@ def test_none_hash(monkeypatch, hash_fn):
         reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils)
         reloaded_kv_cache_utils.init_none_hash(hash_fn)
         assert reloaded_kv_cache_utils.NONE_HASH is not None
-        assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int)
+        assert isinstance(reloaded_kv_cache_utils.NONE_HASH, bytes)
         assert hash_fn('python hash seed') == reloaded_kv_cache_utils.NONE_HASH
 
 
 def test_kv_cache_block():
-    import vllm.v1.core.kv_cache_utils
 
     # Test KVCacheBlock initialization
     block = KVCacheBlock(block_id=0)
@@ -127,8 +128,7 @@ def test_kv_cache_block():
     assert block.ref_cnt == 0
 
     # Test block hash setting and resetting
-    block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123,
-                                                       token_ids=(1, 2, 3))
+    block_hash = make_block_hash_with_group_id(BlockHash(b"abc"), 0)
     block.block_hash = block_hash
     assert block.block_hash == block_hash
 
@@ -247,7 +247,7 @@ def test_free_kv_cache_block_queue_append_n():
 
 def test_free_kv_cache_block_queue_popleft_n():
     blocks = [KVCacheBlock(block_id=i) for i in range(6)]
-    # Create a empty FreeKVCacheBlockQueue with these blocks
+    # Create an empty FreeKVCacheBlockQueue with these blocks
     queue = FreeKVCacheBlockQueue(
         [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]])
     assert queue.num_free_blocks == 6
@@ -407,27 +407,23 @@ def test_generate_block_hash_extra_keys_cache_salt():
     assert next_mm_idx == 1
 
 
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_hash_block_tokens(hash_fn):
-    import vllm.v1.core.kv_cache_utils
     init_none_hash(hash_fn)
-    parent_block_hash = 123
+    parent_block_hash = BlockHash(b"123")
     curr_block_token_ids = (1, 2, 3)
     extra_keys = ("key1", "key2")
 
     block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                    curr_block_token_ids, extra_keys)
-    assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash)
-    assert block_hash.hash_value == hash_fn(
-        (parent_block_hash, curr_block_token_ids, extra_keys))
-    assert block_hash.token_ids == curr_block_token_ids
-    assert block_hash.extra_keys == extra_keys
+    expected = hash_fn((parent_block_hash, curr_block_token_ids, extra_keys))
+    assert block_hash == expected
 
 
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_request_block_hasher(hash_fn):
-    import vllm.v1.core.kv_cache_utils
-    init_none_hash(hash_fn)
+    kv_cache_utils.init_none_hash(hash_fn)
+
     request = make_request(
         request_id="0",
         prompt_token_ids=[_ for _ in range(6)],
@@ -442,19 +438,13 @@ def test_request_block_hasher(hash_fn):
 
     block_hashes = request.block_hashes
     assert len(block_hashes) == 2
-    assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash)
-    assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash)
-
-    # Check the first block
-    assert block_hashes[0].token_ids == (0, 1, 2)
-    assert block_hashes[0].extra_keys == ("hash1", )
+    assert block_hashes[0] == hash_fn(
+        (kv_cache_utils.NONE_HASH, (0, 1, 2), ("hash1", )))
+    assert block_hashes[1] == hash_fn(
+        (block_hashes[0], (3, 4, 5), ("hash2", )))
 
-    # Check the second block
-    assert block_hashes[1].token_ids == (3, 4, 5)
-    assert block_hashes[1].extra_keys == ("hash2", )
 
-
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_hash_tokens_different_mm_input(hash_fn):
     init_none_hash(hash_fn)
 
@@ -484,9 +474,9 @@ def test_hash_tokens_different_mm_input(hash_fn):
     assert block_hashes1[1] != block_hashes2[1]
 
 
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_hash_request_tokens_no_mm_inputs(hash_fn):
-    init_none_hash(hash_fn)
+    kv_cache_utils.init_none_hash(hash_fn)
 
     request = make_request(
         request_id="0",
@@ -500,10 +490,9 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
     block_hashes = request.block_hashes
 
     assert len(block_hashes) == 2
-    assert block_hashes[0].token_ids == (0, 1, 2)
-    assert block_hashes[0].extra_keys is None
-    assert block_hashes[1].token_ids == (3, 4, 5)
-    assert block_hashes[1].extra_keys is None
+    assert block_hashes[0] == hash_fn(
+        (kv_cache_utils.NONE_HASH, (0, 1, 2), None))
+    assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
 
 
 def test_metrics():
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index e7a8f63702b300127f7960ac822cde5dd3775ea6..659d768bcf2e9b53b86de8b8d497d80ea46637e2 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -8,17 +8,19 @@ from typing import Callable, Optional
 import pytest
 import torch
 
+import vllm.v1.core.kv_cache_utils as kv_cache_utils
 from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256, sha256_cbor_64bit
+from vllm.utils import sha256, sha256_cbor
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
-from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         KVCacheBlock,
+from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
+                                         get_block_hash, get_group_id,
                                          get_request_block_hasher,
-                                         hash_block_tokens, init_none_hash)
+                                         hash_block_tokens, init_none_hash,
+                                         make_block_hash_with_group_id)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, SlidingWindowSpec)
 
@@ -101,8 +103,10 @@ def make_kv_cache_config_hybrid_model(block_size: int,
     )
 
 
-@pytest.mark.parametrize("hash_algo", ["sha256", "sha256_cbor_64bit", "hash"])
-def test_prefill(hash_algo):
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
+def test_prefill(hash_fn):
+    init_none_hash(hash_fn)
+
     block_size = 16
     manager = KVCacheManager(
         make_kv_cache_config(block_size, 11),
@@ -110,10 +114,6 @@ def test_prefill(hash_algo):
         enable_caching=True,
     )
 
-    # choose the hash function according to the parameter
-    hash_fn = (sha256_cbor_64bit if hash_algo == "sha256_cbor_64bit" else
-               sha256 if hash_algo == "sha256" else hash)
-
     # Complete 3 blocks (48 tokens)
     common_token_ids = [i for i in range(3) for _ in range(16)]
 
@@ -137,10 +137,12 @@ def test_prefill(hash_algo):
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[
-            block_id].block_hash.block_hash == block_hash
+        blk_hash = manager.block_pool.blocks[block_id].block_hash
+        assert blk_hash is not None
+        assert get_block_hash(blk_hash) == block_hash
+        assert get_group_id(blk_hash) == 0
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
-        parent_block_hash = block_hash.hash_value
+        parent_block_hash = block_hash
 
     # Check partial block metadata
     for block_id in (4, ):
@@ -233,7 +235,7 @@ def test_prefill_hybrid_model():
         enable_caching=True,
     )
 
-    hash_fn = hash
+    hash_fn = sha256
 
     # Complete 3 blocks (48 tokens)
     common_token_ids = [i for i in range(3) for _ in range(block_size)]
@@ -260,11 +262,13 @@ def test_prefill_hybrid_model():
         block_tokens = tuple(all_token_ids[(length - 1) * 16:length * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        for block_id in block_ids:
-            assert manager.block_pool.blocks[
-                block_id].block_hash.block_hash == block_hash
+        for group_id, block_id in enumerate(block_ids):
+            blk_hash = manager.block_pool.blocks[block_id].block_hash
+            assert blk_hash is not None
+            assert get_block_hash(blk_hash) == block_hash
+            assert get_group_id(blk_hash) == group_id
             assert manager.block_pool.blocks[block_id].ref_cnt == 1
-        parent_block_hash = block_hash.hash_value
+        parent_block_hash = block_hash
 
     # Check partial block metadata
     for block_id in (4, 8, 12):
@@ -298,11 +302,10 @@ def test_prefill_hybrid_model():
     cached_block_hash_to_block_bak = copy.copy(
         manager.block_pool.cached_block_hash_to_block)
 
-    def test_partial_request_hit(request_id: str,
-                                 hash_to_evict: list[BlockHashWithGroupId],
+    def test_partial_request_hit(request_id: str, hash_to_evict: list[bytes],
                                  expect_hit_length: int):
         req = make_request(request_id, common_token_ids + unique_token_ids,
-                           block_size, hash)
+                           block_size, sha256)
         for hash_with_group_id in hash_to_evict:
             manager.block_pool.cached_block_hash_to_block.pop(
                 hash_with_group_id)
@@ -319,33 +322,32 @@ def test_prefill_hybrid_model():
 
     # Evict the blocks outside sliding window, does not affect the hit length.
     test_partial_request_hit("2", [
-        BlockHashWithGroupId(block_hashes[0], 1),
-        BlockHashWithGroupId(block_hashes[0], 2)
+        make_block_hash_with_group_id(block_hashes[0], 1),
+        make_block_hash_with_group_id(block_hashes[0], 2)
     ], 3)
 
     # Evict the first block of full attention, makes total cache miss.
-    test_partial_request_hit("3", [
-        BlockHashWithGroupId(block_hashes[0], 0),
-    ], 0)
+    test_partial_request_hit(
+        "3", [make_block_hash_with_group_id(block_hashes[0], 0)], 0)
 
     # Evict the last block of all layers, reduces the hit length to 2.
     test_partial_request_hit("4", [
-        BlockHashWithGroupId(block_hashes[2], 0),
-        BlockHashWithGroupId(block_hashes[2], 1),
-        BlockHashWithGroupId(block_hashes[2], 2),
+        make_block_hash_with_group_id(block_hashes[2], 0),
+        make_block_hash_with_group_id(block_hashes[2], 1),
+        make_block_hash_with_group_id(block_hashes[2], 2),
     ], 2)
 
     # Evict the last block of full attention, reduces the hit length to 2.
-    test_partial_request_hit("5", [BlockHashWithGroupId(block_hashes[2], 0)],
-                             2)
+    test_partial_request_hit(
+        "5", [make_block_hash_with_group_id(block_hashes[2], 0)], 2)
 
     # Evict the last block of sliding window, reduces the hit length to 2.
-    test_partial_request_hit("6", [BlockHashWithGroupId(block_hashes[2], 1)],
-                             2)
+    test_partial_request_hit(
+        "6", [make_block_hash_with_group_id(block_hashes[2], 1)], 2)
 
     # Evict the last block of sliding window, reduces the hit length to 2.
-    test_partial_request_hit("7", [BlockHashWithGroupId(block_hashes[2], 2)],
-                             2)
+    test_partial_request_hit(
+        "7", [make_block_hash_with_group_id(block_hashes[2], 2)], 2)
 
     # Evict different set of blocks for full attention and sliding window makes
     # total cache miss.
@@ -353,9 +355,9 @@ def test_prefill_hybrid_model():
     # The cache hit length of sliding window is 2 * block_size.
     # Then it is cache miss as the two type of layers have different hit length.
     test_partial_request_hit("8", [
-        BlockHashWithGroupId(block_hashes[2], 0),
-        BlockHashWithGroupId(block_hashes[0], 1),
-        BlockHashWithGroupId(block_hashes[0], 2),
+        make_block_hash_with_group_id(block_hashes[2], 0),
+        make_block_hash_with_group_id(block_hashes[0], 1),
+        make_block_hash_with_group_id(block_hashes[0], 2),
     ], 0)
 
 
@@ -372,8 +374,8 @@ def test_prefill_plp():
         max_model_len=8192,
         enable_caching=True,
     )
-    # the default hash function is hash
-    hash_fn = hash
+    # the default hash function is sha256
+    hash_fn = sha256
 
     # Complete 3 blocks (48 tokens)
     common_token_ids = [i for i in range(3) for _ in range(16)]
@@ -404,10 +406,12 @@ def test_prefill_plp():
         block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
         block_hash = hash_block_tokens(hash_fn, parent_block_hash,
                                        block_tokens)
-        assert manager.block_pool.blocks[
-            block_id].block_hash.block_hash == block_hash
+        blk_hash = (manager.block_pool.blocks[block_id].block_hash)
+        assert blk_hash is not None
+        assert get_block_hash(blk_hash) == block_hash
+        assert get_group_id(blk_hash) == 0
         assert manager.block_pool.blocks[block_id].ref_cnt == 1
-        parent_block_hash = block_hash.hash_value
+        parent_block_hash = block_hash
 
     # Check partial block metadata
     for block_id in (4, ):
@@ -493,7 +497,7 @@ def test_decode():
     # Incomplete 1 block (7 tokens)
     unique_token_ids = [3] * 7
     req0 = make_request("0", common_token_ids + unique_token_ids, block_size,
-                        hash)
+                        sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -538,7 +542,7 @@ def test_evict():
     )
 
     last_token_id = 5 * 16 + 7
-    req0 = make_request("0", list(range(last_token_id)), block_size, hash)
+    req0 = make_request("0", list(range(last_token_id)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -550,7 +554,7 @@ def test_evict():
     # 3 blocks.
     req1 = make_request("1", list(range(last_token_id,
                                         last_token_id + 3 * 16)), block_size,
-                        hash)
+                        sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -572,7 +576,7 @@ def test_evict():
     ] == [10, 6, 5, 4, 3, 2, 1, 9, 8, 7]
 
     # Touch the first 2 blocks.
-    req2 = make_request("2", list(range(2 * 16 + 3)), block_size, hash)
+    req2 = make_request("2", list(range(2 * 16 + 3)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert computed_blocks.get_block_ids() == ([1, 2], )
     assert num_computed_tokens == 2 * 16
@@ -597,7 +601,7 @@ def test_hash_block_correct_reuse():
 
     # Allocate 1 block and cache it.
     num_tokens = block_size * 1
-    req = make_request("0", list(range(num_tokens)), block_size, hash)
+    req = make_request("0", list(range(num_tokens)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -611,7 +615,7 @@ def test_hash_block_correct_reuse():
 
     # Allocate a new block that's not full, make sure hash info on the
     # block is cleared.
-    req = make_request("1", list(range(num_tokens - 1)), block_size, hash)
+    req = make_request("1", list(range(num_tokens - 1)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -638,7 +642,7 @@ def test_computed_blocks_not_evicted():
 
     # Allocate a block and cache it.
     num_tokens = block_size * 1
-    req0 = make_request("0", list(range(num_tokens)), block_size, hash)
+    req0 = make_request("0", list(range(num_tokens)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -650,7 +654,7 @@ def test_computed_blocks_not_evicted():
 
     # Allocate another block.
     req1 = make_request("1", list(range(num_tokens, num_tokens * 2)),
-                        block_size, hash)
+                        block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -666,7 +670,7 @@ def test_computed_blocks_not_evicted():
 
     # Now if we have a cache hit on the first block, we should evict the second
     # cached block rather than the first one.
-    req2 = make_request("2", list(range(num_tokens * 2)), block_size, hash)
+    req2 = make_request("2", list(range(num_tokens * 2)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(computed_blocks.blocks[0]) == 1
     assert computed_blocks.blocks[0][0].block_id == 1
@@ -691,7 +695,7 @@ def test_basic_prefix_caching_disabled():
     )
 
     req1 = make_request("1", list(range(10)), block_size,
-                        hash)  # 2 blocks and some more
+                        sha256)  # 2 blocks and some more
 
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert not computed_blocks.blocks[0]
@@ -706,7 +710,7 @@ def test_basic_prefix_caching_disabled():
 
     # No caching.
     req2 = make_request("2", list(range(16)), block_size,
-                        hash)  # shared prefix
+                        sha256)  # shared prefix
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -716,7 +720,7 @@ def test_basic_prefix_caching_disabled():
     assert len(blocks.blocks[0]) == 4
 
     # New requests should not have any blocks.
-    req3 = make_request("3", list(range(4)), block_size, hash)
+    req3 = make_request("3", list(range(4)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -726,7 +730,7 @@ def test_basic_prefix_caching_disabled():
     assert not blocks
 
 
-@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor_64bit, hash])
+@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
 def test_cache_blocks(hash_fn):
     """
     This is a unit test that tests the correctness of the _cache_full_blocks
@@ -787,7 +791,7 @@ def test_cache_blocks_multi_group():
     #  Block 1/5: [4, 5, 6, 7]
     #  Block 2/6: [8, 9, 10, 11]
     #  Block 3/7: [12, 13]
-    req = make_request("0", list(range(14)), block_size, hash)
+    req = make_request("0", list(range(14)), block_size, sha256)
 
     # Cache the blocks for group 0.
     blocks = [KVCacheBlock(block_id=i) for i in range(2)]
@@ -845,6 +849,8 @@ def test_mm_prefix_caching():
     """
     This tests that the multi-modal prefix caching is correct.
     """
+    kv_cache_utils.init_none_hash(sha256)
+
     block_size = 16
     manager = KVCacheManager(
         make_kv_cache_config(block_size, 11),
@@ -874,23 +880,30 @@ def test_mm_prefix_caching():
     req0 = make_request("0",
                         all_token_ids,
                         block_size,
-                        hash,
+                        sha256,
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
-    # Completed block should have hashes with extra keys.
+    # Completed block should have hashes
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = req0.block_hashes
     assert len(block_hashes) == 3
-    assert block_hashes[0].extra_keys == ("aaa", )
-    assert block_hashes[1].extra_keys == ("aaa", "bbb")
-    assert block_hashes[2].extra_keys == ("bbb", )
+    assert block_hashes[0] == sha256(
+        (kv_cache_utils.NONE_HASH, tuple(all_token_ids[:block_size]),
+         ("aaa", )))
+    assert block_hashes[1] == sha256(
+        (block_hashes[0], tuple(all_token_ids[block_size:block_size * 2]),
+         ("aaa", "bbb")))
+    assert block_hashes[2] == sha256(
+        (block_hashes[1], tuple(all_token_ids[block_size * 2:block_size * 3]),
+         ("bbb", )))
 
     blocks = manager.allocate_slots(req0, 59,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
+    assert blocks is not None
     assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
@@ -901,10 +914,10 @@ def test_mm_prefix_caching():
                                         len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
-
-    # The just completed block should have hashes with extra keys.
     assert len(block_hashes) == 4
-    assert block_hashes[3].extra_keys == ("ccc", )
+    assert block_hashes[3] == sha256(
+        (block_hashes[2], tuple(all_token_ids[3 * block_size:] + [8] * 5),
+         ("ccc", )))
 
     # Cache hit.
     unique_token_ids = [-1] * 7 + [200] * 5
@@ -916,7 +929,7 @@ def test_mm_prefix_caching():
     req1 = make_request("1",
                         all_token_ids,
                         block_size,
-                        hash,
+                        sha256,
                         mm_positions=mm_positions,
                         mm_hashes=mm_hashes)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
@@ -929,6 +942,8 @@ def test_cache_key_salting():
     This tests that cache salts are applied during hashing and the cache
     is separated cache as expected.
     """
+    kv_cache_utils.init_none_hash(sha256)
+
     block_size = 16
     manager = KVCacheManager(
         make_kv_cache_config(block_size, 11),
@@ -939,21 +954,26 @@ def test_cache_key_salting():
     # 3 complete blocks and an incomplete block with 11 tokens.
     common_token_ids = [i for i in range(3) for _ in range(block_size)]
     token_ids = common_token_ids + [3] * 11
-    req0 = make_request("0", token_ids, block_size, hash, cache_salt="salt1")
+    req0 = make_request("0", token_ids, block_size, sha256, cache_salt="salt1")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
 
-    # Completed block should have hashes with extra keys.
+    # Completed block should have hashes
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
     block_hashes = req0.block_hashes
     assert len(block_hashes) == 3
-    assert block_hashes[0].extra_keys == ("salt1", )
-    assert block_hashes[1].extra_keys is None
-    assert block_hashes[2].extra_keys is None
+    assert block_hashes[0] == sha256(
+        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt1", )))
+    assert block_hashes[1] == sha256(
+        (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None))
+    assert block_hashes[2] == sha256(
+        (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]),
+         None))
 
     blocks = manager.allocate_slots(req0, 59,
                                     len(computed_blocks.blocks[0]) * 16,
                                     computed_blocks)
+    assert blocks is not None
     assert blocks.get_block_ids() == ([1, 2, 3, 4], )
     req0.num_computed_tokens = 59
 
@@ -964,14 +984,13 @@ def test_cache_key_salting():
                                         len(computed_blocks.blocks[0]) * 16,
                                         computed_blocks)
     assert new_blocks is not None and len(new_blocks.blocks[0]) == 0
-
-    # Now one more block that should not have extra keys.
     assert len(block_hashes) == 4
-    assert block_hashes[3].extra_keys is None
+    assert block_hashes[3] == sha256(
+        (block_hashes[2], tuple(token_ids[3 * block_size:] + [8] * 5), None))
 
     # Test cache hit with a new request that has the same salt.
     token_ids = common_token_ids + [4] * 11
-    req1 = make_request("1", token_ids, block_size, hash, cache_salt="salt1")
+    req1 = make_request("1", token_ids, block_size, sha256, cache_salt="salt1")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     # Should match only a prefix of 3 blocks.
     assert len(computed_blocks.blocks[0]) == 3
@@ -979,13 +998,19 @@ def test_cache_key_salting():
 
     # Test cache miss with same content but different salt.
     token_ids = common_token_ids + [4] * 11
-    req2 = make_request("2", token_ids, block_size, hash, cache_salt="salt2")
+    req2 = make_request("2", token_ids, block_size, sha256, cache_salt="salt2")
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert len(computed_blocks.blocks[0]) == 0
     assert num_computed_tokens == 0
     block_hashes = req2.block_hashes
     assert len(block_hashes) == 3
-    assert block_hashes[0].extra_keys == ("salt2", )
+    assert block_hashes[0] == sha256(
+        (kv_cache_utils.NONE_HASH, tuple(token_ids[:block_size]), ("salt2", )))
+    assert block_hashes[1] == sha256(
+        (block_hashes[0], tuple(token_ids[block_size:block_size * 2]), None))
+    assert block_hashes[2] == sha256(
+        (block_hashes[1], tuple(token_ids[block_size * 2:block_size * 3]),
+         None))
 
 
 def test_prefill_not_enough_free_blocks_with_computed_blocks():
@@ -1004,7 +1029,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # Complete 3 blocks (48 tokens)
     # | Common-0 | Common-1 | Common-2 | ... |
     common_token_ids = [i for i in range(3) for _ in range(16)]
-    req0 = make_request("0", common_token_ids, block_size, hash)
+    req0 = make_request("0", common_token_ids, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -1015,7 +1040,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
         req0.request_id]
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
-    req1 = make_request("1", common_token_ids * 2, block_size, hash)
+    req1 = make_request("1", common_token_ids * 2, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
     assert computed_blocks.blocks[0] == block_part0
     assert num_computed_tokens == 3 * 16
@@ -1032,7 +1057,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
 
     # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
     # | Req1-5(F)| Req2-0   | Req2-1   | ... |
-    req2 = make_request("2", [7] * block_size * 2, block_size, hash)
+    req2 = make_request("2", [7] * block_size * 2, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -1044,7 +1069,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
     # but it cannot be allocated due to insufficient free blocks (2).
     # In this case, the ref_cnt of the computed blocks should not be changed.
     assert manager.block_pool.free_block_queue.num_free_blocks == 5
-    req3 = make_request("3", common_token_ids * 3, block_size, hash)
+    req3 = make_request("3", common_token_ids * 3, block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
     assert computed_blocks.blocks[0] == block_part1
     assert num_computed_tokens == 6 * 16
@@ -1069,13 +1094,13 @@ def test_reset_prefix_cache():
     full_block_token_ids = [i for i in range(3) for _ in range(16)]
     unique_token_ids = [3] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
-    req0 = make_request("0", all_token_ids, block_size, hash)
+    req0 = make_request("0", all_token_ids, block_size, sha256)
     blocks = manager.allocate_slots(req0, 55)
     assert blocks.get_block_ids() == ([1, 2, 3, 4], )
 
     unique_token_ids = [4] * 7
     all_token_ids = full_block_token_ids + unique_token_ids
-    req1 = make_request("1", all_token_ids, block_size, hash)
+    req1 = make_request("1", all_token_ids, block_size, sha256)
     computed_blocks, _ = manager.get_computed_blocks(req1)
     assert len(req1.block_hashes) == 3
     assert len(computed_blocks.blocks[0]) == 3
@@ -1109,7 +1134,7 @@ def test_prefix_cache_stats_disabled():
     assert manager.prefix_cache_stats is None
 
     # Call all functions that check whether log_stats is disabled.
-    req = make_request("0", list(range(16)), block_size, hash)
+    req = make_request("0", list(range(16)), block_size, sha256)
     computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
     assert not computed_blocks.blocks[0]
     assert num_computed_tokens == 0
@@ -1124,15 +1149,9 @@ def test_prefix_cache_stats_disabled():
 
 def test_maybe_evict_cached_block():
     pool = BlockPool(num_gpu_blocks=4, enable_caching=True)
-    block_hash0 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=10,
-                                                            token_ids=(100, )),
-                                       group_id=1000)
-    block_hash1 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=20,
-                                                            token_ids=(200, )),
-                                       group_id=2000)
-    block_hash2 = BlockHashWithGroupId(block_hash=BlockHash(hash_value=30,
-                                                            token_ids=(300, )),
-                                       group_id=3000)
+    block_hash0 = make_block_hash_with_group_id(BlockHash(b"10"), 1000)
+    block_hash1 = make_block_hash_with_group_id(BlockHash(b"20"), 2000)
+    block_hash2 = make_block_hash_with_group_id(BlockHash(b"30"), 3000)
     block_hashes = [
         block_hash0,
         block_hash1,
@@ -1206,7 +1225,7 @@ def test_kv_cache_events(blocks_to_cache: int):
     )
 
     num_tokens = block_size * blocks_to_cache
-    req0 = make_request("0", list(range(num_tokens)), block_size, hash)
+    req0 = make_request("0", list(range(num_tokens)), block_size, sha256)
     _ = manager.allocate_slots(req0, num_tokens)
     events = manager.take_events()
 
@@ -1222,7 +1241,7 @@ def test_kv_cache_events(blocks_to_cache: int):
     # Should see block_to_cache number of removed block events and a new block
     # stored event
     manager.free(req0)
-    req1 = make_request("1", list(range(num_tokens)), block_size, hash)
+    req1 = make_request("1", list(range(num_tokens)), block_size, sha256)
     _ = manager.allocate_slots(req1, num_tokens)
     events = manager.take_events()
 
@@ -1256,7 +1275,7 @@ def test_eagle_enabled_removes_last_block():
 
     # Request with 3 full blocks (48 tokens)
     token_ids = [0] * (3 * block_size)
-    req = make_request("divisible_request", token_ids, block_size, hash)
+    req = make_request("divisible_request", token_ids, block_size, sha256)
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
@@ -1266,7 +1285,7 @@ def test_eagle_enabled_removes_last_block():
     manager.free(req)
 
     # New request with same tokens + Eagle enabled
-    req_eagle = make_request("eagle_divisible", token_ids, block_size, hash)
+    req_eagle = make_request("eagle_divisible", token_ids, block_size, sha256)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
 
     # Should retain 1 block:
@@ -1287,7 +1306,7 @@ def test_eagle_with_partial_blocks():
     )
     # 2 full blocks + 5 tokens (non-divisible length)
     token_ids = [0] * (2 * block_size + 5)
-    req = make_request("partial_block_test", token_ids, block_size, hash)
+    req = make_request("partial_block_test", token_ids, block_size, sha256)
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
@@ -1297,7 +1316,7 @@ def test_eagle_with_partial_blocks():
     manager.free(req)
 
     # New request with Eagle enabled
-    req_eagle = make_request("partial_eagle", token_ids, block_size, hash)
+    req_eagle = make_request("partial_eagle", token_ids, block_size, sha256)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
     assert len(computed_blocks.blocks[0]) == 1
@@ -1328,7 +1347,7 @@ def test_eagle_with_sliding_window():
 
     # 2 full blocks + 5 tokens (non-divisible length)
     token_ids = [0] * (2 * block_size + 5)
-    req = make_request("partial_block_test", token_ids, block_size, hash)
+    req = make_request("partial_block_test", token_ids, block_size, sha256)
 
     # Prime the cache
     computed_blocks, _ = manager.get_computed_blocks(req)
@@ -1341,7 +1360,7 @@ def test_eagle_with_sliding_window():
     manager.free(req)
 
     # New request with Eagle enabled
-    req_eagle = make_request("partial_eagle", token_ids, block_size, hash)
+    req_eagle = make_request("partial_eagle", token_ids, block_size, sha256)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_eagle)
     # Original match: 2 full blocks → Eagle removes 1 → 1 remaining
     assert len(computed_blocks.blocks[0]) == 1
@@ -1351,11 +1370,11 @@ def test_eagle_with_sliding_window():
     assert manager.block_pool.get_cached_block(
         block_hash_first_block, kv_cache_group_ids=[0]) is not None
     manager.block_pool.cached_block_hash_to_block.pop(
-        BlockHashWithGroupId(block_hash_first_block, 0))
+        make_block_hash_with_group_id(block_hash_first_block, 0))
 
     # New request
     req_after_evict = make_request("partial_eagle_after_evict", token_ids,
-                                   block_size, hash)
+                                   block_size, sha256)
     computed_blocks, num_tokens = manager.get_computed_blocks(req_after_evict)
     # Cache miss. The only hit prefix is [NULL_BLOCK, BLOCK_2] if eagle is
     # not considered. But after dropping the last matched block due to eagle,
diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py
index 7dcebba491fab60f7702c0b64260c1e35ec48041..b70850a9bcff9788af06b26a3d5da690922a9b9a 100644
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -6,8 +6,8 @@ import random
 import torch
 
 from vllm.v1.core.block_pool import BlockPool
-from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         KVCacheBlock)
+from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
+                                         make_block_hash_with_group_id)
 from vllm.v1.core.single_type_kv_cache_manager import (
     ChunkedLocalAttentionManager, SlidingWindowManager)
 from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
@@ -44,7 +44,7 @@ def test_chunked_local_attention_possible_cached_prefix():
 
     def run_one_case(block_is_cached, tail_token, expect_length):
         block_hash_list = [
-            BlockHash(i, ()) for i in range(len(block_is_cached))
+            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
         ]
 
         block_pool.cached_block_hash_to_block.clear()
@@ -53,8 +53,8 @@ def test_chunked_local_attention_possible_cached_prefix():
         for i, (block_hash,
                 is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
             if is_cached:
-                block_pool.cached_block_hash_to_block[BlockHashWithGroupId(
-                    block_hash, 0)] = {
+                block_pool.cached_block_hash_to_block[
+                    make_block_hash_with_group_id(block_hash, 0)] = {
                         i: block_pool.blocks[i + 10],
                     }
 
@@ -109,7 +109,7 @@ def test_sliding_window_possible_cached_prefix():
 
     def run_one_case(block_is_cached, expect_length):
         block_hash_list = [
-            BlockHash(i, ()) for i in range(len(block_is_cached))
+            BlockHash(str(i).encode()) for i in range(len(block_is_cached))
         ]
 
         block_pool.cached_block_hash_to_block.clear()
@@ -118,8 +118,8 @@ def test_sliding_window_possible_cached_prefix():
         for i, (block_hash,
                 is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
             if is_cached:
-                block_pool.cached_block_hash_to_block[BlockHashWithGroupId(
-                    block_hash, 0)] = {
+                block_pool.cached_block_hash_to_block[
+                    make_block_hash_with_group_id(block_hash, 0)] = {
                         i: block_pool.blocks[i + 10],
                     }
 
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index e392c2c336e9beaa109aa0dd24f70ac054b6cae9..d343141cdf4cb9e2fd1013435ffce4f4cd85bff6 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -9,6 +9,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
 from vllm.sampling_params import SamplingParams
+from vllm.utils import sha256
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
 from vllm.v1.core.sched.async_scheduler import AsyncScheduler
@@ -130,10 +131,10 @@ def create_requests(
 ) -> list[Request]:
     global _none_hash_initialized
     if not _none_hash_initialized:
-        init_none_hash(hash)
+        init_none_hash(sha256)
         _none_hash_initialized = True
 
-    block_hasher = get_request_block_hasher(block_size, hash)
+    block_hasher = get_request_block_hasher(block_size, sha256)
     sampling_params = SamplingParams(ignore_eos=False,
                                      max_tokens=max_tokens,
                                      stop_token_ids=stop_token_ids,
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index 81655e41750061456d053bcb212d664872c93c8d..25e01806f49562a627b41de96b36465c45d6aaa7 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -62,6 +62,16 @@ backend_configs = {
                       "cudagraph_mode": "FULL_AND_PIECEWISE",
                   },
                   specific_gpu_arch=(9, 0)),
+    # FlashAttention MLA on Hopper
+    "FlashAttentionMLA":
+    BackendConfig(name="FlashAttentionMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_DECODE_ONLY",
+                  },
+                  specific_gpu_arch=(9, 0)),
     # FA2
     "FA2":
     BackendConfig(name="FA2",
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index bd0fa6b80781a1f6b43fbcb9e7e3ed02edc90602..0b240b7d434e52de0bc0ee03b4e6e98f2dc8a234 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -83,7 +83,7 @@ def test_ngram_correctness(
     model_name: str,
 ):
     '''
-    Compare the outputs of a original LLM and a speculative LLM
+    Compare the outputs of an original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
     with monkeypatch.context() as m:
@@ -117,45 +117,38 @@ def test_ngram_correctness(
                 print(f"ref_output: {ref_output.outputs[0].text}")
                 print(f"spec_output: {spec_output.outputs[0].text}")
 
-        # Heuristic: expect at least 70% of the prompts to match exactly
+        # Heuristic: expect at least 68% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
+        assert matches >= int(0.68 * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
 
 
-@pytest.mark.parametrize(
-    ["model_setup", "mm_enabled"],
-    [
-        # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
-        # (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
-        (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
-        (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
-        pytest.param(
-            ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-             "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-            False,
-            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-        pytest.param(
-            ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-             "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
-            True,
-            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
-        (("eagle", "eagle618/deepseek-v3-random",
-          "eagle618/eagle-deepseek-v3-random", 1), False),
-    ],
-    ids=[
-        # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
-        # "qwen3_eagle3",
-        "llama3_eagle",
-        "llama3_eagle3",
-        "llama4_eagle",
-        "llama4_eagle_mm",
-        "deepseek_eagle"
-    ])
+@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
+    (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
+    (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
+      "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
+    (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
+      "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
+    pytest.param(
+        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+        False,
+        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+    pytest.param(
+        ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+         "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+        True,
+        marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+    (("eagle", "eagle618/deepseek-v3-random",
+      "eagle618/eagle-deepseek-v3-random", 1), False),
+],
+                         ids=[
+                             "qwen3_eagle3", "llama3_eagle", "llama3_eagle3",
+                             "llama4_eagle", "llama4_eagle_mm",
+                             "deepseek_eagle"
+                         ])
 @pytest.mark.parametrize("attn_backend",
                          get_attn_backend_list_based_on_platform())
 def test_eagle_correctness(
@@ -169,7 +162,7 @@ def test_eagle_correctness(
         # TODO: Fix this flaky test
         pytest.skip(
             "TREE_ATTN is flaky in the test disable for now until it can be "
-            "reolved (see https://github.com/vllm-project/vllm/issues/22922)")
+            "resolved (see https://github.com/vllm-project/vllm/issues/22922)")
 
     # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index df04a14af70ce5474b916b4814e07a456f0ed6f6..aca546600d0b56d15b04e5207e3b52bd48fff4fd 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -393,7 +393,7 @@ class MockLoggingStatLogger(LoggingStatLogger):
 async def test_customize_loggers(monkeypatch):
     """Test that we can customize the loggers.
     If a customized logger is provided at the init, it should
-    be used directly.
+    be added to the default loggers.
     """
 
     with monkeypatch.context() as m, ExitStack() as after:
@@ -410,7 +410,8 @@ async def test_customize_loggers(monkeypatch):
 
         stat_loggers = engine.logger_manager.per_engine_logger_dict
         assert len(stat_loggers) == 1
-        assert len(stat_loggers[0]) == 1
+        assert len(
+            stat_loggers[0]) == 2  # LoggingStatLogger + MockLoggingStatLogger
         stat_loggers[0][0].log.assert_called_once()
 
 
diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py
index f70a3ce147ff2638e26ba8b10a83756cad0bd3f9..23ec3673b10b43b9b1f65dbf1ea78d40099aab4e 100644
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -36,18 +36,19 @@ def test_prefix_caching_from_cli():
     assert vllm_config.cache_config.enable_prefix_caching
 
     # default hash algorithm is "builtin"
-    assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256"
+
+    # set hash algorithm to sha256_cbor
+    args = parser.parse_args(["--prefix-caching-hash-algo", "sha256_cbor"])
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.prefix_caching_hash_algo == \
+        "sha256_cbor"
 
     # set hash algorithm to sha256
     args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"])
     vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
     assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256"
 
-    # set hash algorithm to builtin
-    args = parser.parse_args(["--prefix-caching-hash-algo", "builtin"])
-    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
-    assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
-
     # an invalid hash algorithm raises an error
     parser.exit_on_error = False
     with pytest.raises(ArgumentError):
diff --git a/tests/v1/engine/test_processor_multi_modal_uuids.py b/tests/v1/engine/test_processor_multi_modal_uuids.py
index 970a59eca8ece42ad9983dd9a378daf6cdc29d3f..955c74d262a0948afdf5bdef476c73f2fcfd979f 100644
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@@ -152,8 +152,8 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
                         *,
                         tokenization_kwargs=None,
                         lora_request=None,
-                        mm_hash_overrides=None):
-        captured["mm_hash_overrides"] = mm_hash_overrides
+                        mm_uuids=None):
+        captured["mm_uuids"] = mm_uuids
         # Minimal processed inputs for decoder-only flow
         return {"type": "token", "prompt_token_ids": [1]}
 
@@ -180,7 +180,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
         params=SamplingParams(),
     )
 
-    assert captured["mm_hash_overrides"] == mm_uuids
+    assert captured["mm_uuids"] == mm_uuids
 
 
 def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
@@ -196,8 +196,8 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
                         *,
                         tokenization_kwargs=None,
                         lora_request=None,
-                        mm_hash_overrides=None):
-        captured["mm_hash_overrides"] = mm_hash_overrides
+                        mm_uuids=None):
+        captured["mm_uuids"] = mm_uuids
         return {"type": "token", "prompt_token_ids": [1]}
 
     monkeypatch.setattr(processor.input_preprocessor,
@@ -223,7 +223,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
     )
 
     # Expect request-id-based overrides are passed through
-    assert captured["mm_hash_overrides"] == {
+    assert captured["mm_uuids"] == {
         "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
         "video": [f"{request_id}-video-0"],
     }
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index cd82eb2ac4199a2e493d2b8c8bf424a7b6ac5895..126d8ce8c8e00c62f5caf27b9f2bc71bee3debd7 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -46,12 +46,12 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
     ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None),
     ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None),
-    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
-    ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
+    #FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402
+    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None),
+    # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
+    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
     ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto",
      NGRAM_SPEC_CONFIG),
-    #FIXME: This test is flaky on CI thus disabled
-    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
     ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto",
      NGRAM_SPEC_CONFIG),
     ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),
@@ -122,6 +122,7 @@ def test_structured_output(
         guided_decoding_backend=guided_decoding_backend,
         guided_decoding_disable_any_whitespace=(guided_decoding_backend
                                                 in {"xgrammar", "guidance"}),
+        seed=120,
         tokenizer_mode=tokenizer_mode,
         speculative_config=speculative_config)
 
diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/responses/test_basic.py
index 7a0baa5767cbad92a05a889601da955f28f5cd77..2ee1004493a16a1e916fca14fd80978f4f8bd2b4 100644
--- a/tests/v1/entrypoints/openai/responses/test_basic.py
+++ b/tests/v1/entrypoints/openai/responses/test_basic.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import openai  # use the official client for correctness check
+import openai.types.responses as openai_responses_types
 import pytest
 
 
@@ -86,3 +87,18 @@ async def test_logprobs(client: openai.AsyncOpenAI):
     outputs = response.output
     assert outputs[-1].content[-1].logprobs
     assert len(outputs[-1].content[-1].logprobs[0].top_logprobs) == 5
+
+
+@pytest.mark.asyncio
+async def test_streaming(client: openai.AsyncOpenAI):
+    stream = await client.responses.create(
+        input="What is 13 * 24?",
+        stream=True,
+    )
+    events = [event async for event in stream]
+    assert isinstance(events[0], openai_responses_types.ResponseCreatedEvent)
+    assert any(
+        isinstance(event, openai_responses_types.ResponseTextDeltaEvent)
+        for event in events)
+    assert isinstance(events[-1],
+                      openai_responses_types.ResponseCompletedEvent)
diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/responses/test_image.py
index c8d09fd39fb1399be0f50e0eac17a1bf7a73210b..3ed36ca678c0c6d234dabee8830163f4fd85612c 100644
--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/responses/test_image.py
@@ -8,17 +8,17 @@ import pytest
 import pytest_asyncio
 
 from tests.utils import RemoteOpenAIServer
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_base64
 
 # Use a small vision model for testing
 MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
 MAXIMUM_IMAGES = 2
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
 
 
@@ -52,16 +52,17 @@ async def client(image_server):
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_url:
+        encode_image_base64(local_asset_server.get_image_asset(image_url))
+        for image_url in TEST_IMAGE_ASSETS
     }
 
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                          model_name: str, image_url: str):
     content_text = "What's in this image?"
@@ -91,11 +92,11 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
 async def test_single_chat_session_image_base64encoded(
     client: openai.AsyncOpenAI,
     model_name: str,
-    image_url: str,
+    raw_image_url: str,
     base64_encoded_image: dict[str, str],
 ):
     content_text = "What's in this image?"
@@ -106,7 +107,7 @@ async def test_single_chat_session_image_base64encoded(
             {
                 "type": "input_image",
                 "image_url":
-                f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
+                f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
                 "detail": "auto",
             },
             {
@@ -127,7 +128,8 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
     "image_urls",
-    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                  image_urls: list[str]):
     messages = [{
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 3a65583fab8d3f3a699d58b624aeca8711942318..3114d7639f045728ca49a669d23824e007f9c25f 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -686,7 +686,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
 async def test_completion_with_empty_prompt_embeds(
         client: openai.AsyncOpenAI) -> None:
     """Test completion with empty prompt embeds."""
-    payload: dict[str, list] = {"prompt_embeds": []}
+    payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []}
     headers: dict[str, str] = {"Content-Type": "application/json"}
     # base_url = http://localhost:8000/v1/completions
     response = requests.post(f"{client.base_url}completions",
diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py
index bdd5155c1481d55dc1a21dee5caaab0bb5843f19..4e83e2f9d4b63371b54810365f82953f52fc8539 100644
--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
                        kwargs: Optional[dict] = None,
                        non_block: bool = False,
                        unique_reply_rank: Optional[int] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
         with open(".marker", "w"):
             ...
         return super().collective_rpc(method, timeout, args, kwargs)
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index d8c56ac42f718e579fc39f08f331b80070dde58c..380e72a156336797c72ca7be6d9058713a2b741b 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
                                                        model_runner_output)
 
-    # Ensure the request is finished after 1 tokens.
+    # Ensure the request is finished after 1 token.
     assert request.is_finished()
     assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
     output = engine_core_outputs[0].outputs[0]
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
 
 
 def test_prefix_cache_lifecycle():
-    """Test that remote decode params still works with a prefix cache hit."""
+    """Test that remote decode params still work with a prefix cache hit."""
 
     vllm_config = create_vllm_config()
     scheduler = create_scheduler(vllm_config)
diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index db203b81f15fc86d40c4a0a2d39814c26b48e6f1..6be261e45cb00143b5b2059be60fdbfc2dcea8e2 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -33,7 +33,7 @@ def _check_path_len(path):
 
 
 def _list_path(path):
-    """Return the list of foldername (hashes generatd) under the path"""
+    """Return the list of foldername (hashes generated) under the path"""
     return list(path.iterdir())
 
 
@@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
              image_urls: list[Image], expected_len: int, info: str):
     """
     One individual test to process the prompt and output base on 1 set of input
-    Then check if the length in the strorage path matches the expected length
+    Then check if the length in the storage path matches the expected length
     `info` introduces details or purpose of the individual test
     """
     print(f"***info: {info}***")
@@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path):
     """
     Tests that SharedStorageConnector saves KV to the storage locations
     with proper hashes; that are unique for inputs with identical text but 
-    differnt images (same size), or same multiple images but different orders.
+    different images (same size), or same multiple images but different orders.
     """
     # Using tmp_path as the storage path to store KV
     print(f"KV storage path at: {str(tmp_path)}")
@@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                   img=[image_1],
                   expected_len=2,
                   info=("image_1 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_2],
                   expected_len=2,
                   info=("image_2 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_1, image_2],
                   expected_len=3,
@@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                   img=[image_1, image_2],
                   expected_len=4,
                   info=("[image_1, image_2] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_2, image_1],
                   expected_len=4,
                   info=("[image_2, image_1] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[],
                   expected_len=5,
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 3f068d5e8c7eb3bcc92ec656efe1d8686de05a1d..0cae1c7bc05188c430f8c62952e8abdd70c446fa 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -13,6 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
     KVConnectorFactory)
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
     SharedStorageConnector)
+from vllm.utils import sha256
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                          init_none_hash)
@@ -127,11 +128,11 @@ def create_request(request_id: int,
                    use_all_1s_for_prompt_tokens: bool = False,
                    num_remote_blocks: int = 3,
                    block_size: int = 16,
-                   hash_fn: Callable = hash) -> Request:
+                   hash_fn: Callable = sha256) -> Request:
     """Make dummy request for testing."""
     global _none_hash_initialized
     if not _none_hash_initialized:
-        init_none_hash(hash)
+        init_none_hash(hash_fn)
         _none_hash_initialized = True
 
     kv_transfer_params: Optional[dict[str, Any]] = None
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index a7fde1990f7ed2ea049b58b45dcb0d3fb2dc140a..891f55a14633bdbaefa13fb87a9e7e7bdf379572 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -15,6 +15,7 @@ from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
                                               POOLING_MODEL_NAME, TEMP_GREEDY,
                                               CustomLogitprocSource,
                                               DummyLogitsProcessor,
+                                              WrappedPerReqLogitsProcessor,
                                               dummy_module)
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
 from tests.v1.logits_processors.utils import prompts
@@ -80,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
             target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
             if not all(x == target_token for x in lp_toks):
                 raise AssertionError(
-                    f"Request {bdx} generated {lp_toks}, shoud all be "
+                    f"Request {bdx} generated {lp_toks}, should all be "
                     f"{target_token}")
         else:
             # This request does not exercise custom logitproc (or custom
@@ -161,6 +162,38 @@ def test_custom_logitsprocs(monkeypatch,
     _run_test(kwargs, logitproc_loaded=True)
 
 
+@create_new_process_for_each_test()
+def test_custom_logitsprocs_req(monkeypatch):
+    """Test passing request-level logits processor to offline Python interface
+    
+    Wrap a request-level logits processor to create a batch level logits
+    processor that has a well-defined behavior (mask out all tokens except one
+    `target_token`)
+
+    Construct an `LLM` instance which loads the wrapped logits processor. Pass
+    the custom logitproc as a class object.
+
+    Construct a reference `LLM` instance with no custom logitproc
+
+    Pass in a batch of requests, 50% of which pass a `target_token` value
+    in through `SamplingParams.extra_args`, 50% of which do not.
+
+    Validate that
+    * Requests which do not activate the custom logitproc, yield the same
+      results for both `LLM` instances
+    * Requests which activate the custom logitproc, only output `target_token`
+
+    Args:
+      monkeypatch: for setting env vars
+    """
+
+    # Test that logitproc info is passed to workers
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
+    random.seed(40)
+    _run_test({"logits_processors": [WrappedPerReqLogitsProcessor]},
+              logitproc_loaded=True)
+
+
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("logitproc_source", [
     CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index c36f1bd021c705ab183c341cfa7216e077723bed..7ec35bd3eb639f62669b718dbc033106a141c99a 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -3,15 +3,21 @@
 
 import types
 from enum import Enum, auto
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 
 from vllm.config import VllmConfig
-from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate,
-                                             LogitsProcessor)
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP,
+                                             AdapterLogitsProcessor,
+                                             BatchUpdate, LogitsProcessor,
+                                             RequestLogitsProcessor)
 from vllm.v1.sample.logits_processor.builtin import process_dict_updates
 
+logger = init_logger(__name__)
+
 MODEL_NAME = "facebook/opt-125m"
 POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
 DUMMY_LOGITPROC_ARG = "target_token"
@@ -104,5 +110,60 @@ class EntryPoints(list):
         self.names = [ep.name for ep in eps]
 
 
+class DummyPerReqLogitsProcessor:
+    """The request-level logits processor masks out all logits except the
+    token id identified by `target_token`"""
+
+    def __init__(self, target_token: int) -> None:
+        """Specify `target_token`"""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        val_to_keep = logits[self.target_token].item()
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of wrapping a fake request-level logit processor to create a
+    batch-level logits processor"""
+
+    def is_argmax_invariant(self) -> bool:
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value.
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        target_token: Optional[
+            Any] = params.extra_args and params.extra_args.get("target_token")
+        if target_token is None:
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.", target_token)
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
 """Fake version of importlib.metadata.entry_points"""
 entry_points = lambda group: EntryPoints(group)
diff --git a/tests/v1/metrics/test_engine_logger_apis.py b/tests/v1/metrics/test_engine_logger_apis.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6a4d0a2a2e8bdbcd408735911d680c0cda62cfd
--- /dev/null
+++ b/tests/v1/metrics/test_engine_logger_apis.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+
+import pytest
+
+from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
+from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
+
+
+class DummyStatLogger:
+    """
+    A dummy stat logger for testing purposes.
+    Implements the minimal interface expected by StatLoggerManager.
+    """
+
+    def __init__(self, vllm_config, engine_idx):
+        self.vllm_config = vllm_config
+        self.engine_idx = engine_idx
+        self.recorded = []
+        self.logged = False
+        self.engine_initialized = False
+
+    def record(self, scheduler_stats, iteration_stats, engine_idx):
+        self.recorded.append((scheduler_stats, iteration_stats, engine_idx))
+
+    def log(self):
+        self.logged = True
+
+    def log_engine_initialized(self):
+        self.engine_initialized = True
+
+
+@pytest.fixture
+def log_stats_enabled_engine_args():
+    """
+    Shared fixture providing common AsyncEngineArgs configuration
+    used across multiple tests.
+    """
+    return AsyncEngineArgs(
+        model="distilbert/distilgpt2",
+        dtype="half",
+        disable_log_stats=False,
+        enforce_eager=True,
+    )
+
+
+@pytest.mark.asyncio
+async def test_async_llm_replace_default_loggers(
+        log_stats_enabled_engine_args):
+    """
+    RayPrometheusStatLogger should replace the default PrometheusStatLogger
+    """
+
+    engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args,
+                                       stat_loggers=[RayPrometheusStatLogger])
+    assert isinstance(engine.logger_manager.prometheus_logger,
+                      RayPrometheusStatLogger)
+    engine.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
+    """
+    It's still possible to use custom stat loggers exclusively by passing 
+    disable_log_stats=True in addition to a list of custom stat loggers.
+    """
+    # Create engine_args with disable_log_stats=True for this test
+    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
+    disabled_log_engine_args.disable_log_stats = True
+
+    # Disable default loggers; pass custom stat logger to the constructor
+    engine = AsyncLLM.from_engine_args(disabled_log_engine_args,
+                                       stat_loggers=[DummyStatLogger])
+
+    assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1
+    assert isinstance(engine.logger_manager.per_engine_logger_dict[0][0],
+                      DummyStatLogger)
+
+    # log_stats is still True, since custom stat loggers are used
+    assert engine.log_stats
+
+    engine.shutdown()
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index e835c029634cef8aabf13701dd8a9a677acef9ec..570e330208a39ad93fe25645e9832aea47d8d415 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -430,7 +430,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
 
 
 def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
-    """Engine should return all vocabulary logprobs
+    """Engine should return all vocabulary logprobs and prompt logprobs
 
     Args:
       example_prompts: list of example prompts (test fixture)
@@ -444,16 +444,24 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
             # 2 other llms alive during whole session
             gpu_memory_utilization=0.15,
             max_model_len=256)
+
         sampling_params_logprobs_all = SamplingParams(max_tokens=5,
-                                                      logprobs=-1)
+                                                      logprobs=-1,
+                                                      prompt_logprobs=-1)
         results_logprobs_all = runner.llm.generate(
             example_prompts, sampling_params=sampling_params_logprobs_all)
         vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
+
         for i in range(len(results_logprobs_all)):
             logprobs = results_logprobs_all[i].outputs[0].logprobs
+            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
             assert logprobs is not None
             for logprob in logprobs:
                 assert len(logprob) == vocab_size
+            assert prompt_logprobs is not None
+            assert prompt_logprobs[0] is None
+            for prompt_logprob in prompt_logprobs[1:]:
+                assert len(prompt_logprob) == vocab_size
 
 
 @pytest.mark.parametrize("logprobs_mode", list(LogprobsMode))
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 7b8445a0b28782859b8d3a072b3e7f19c7914469..ddedc61aae2960a1981efde908dd5105f1d99185 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -12,9 +12,10 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
                                       create_common_attn_metadata,
                                       create_standard_kv_cache_spec,
                                       get_attention_backend)
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
+from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
 from vllm.v1.spec_decode.eagle import EagleProposer
@@ -183,7 +184,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     mock_pp_group.world_size = pp_size
     mock_get_pp_group.return_value = mock_pp_group
 
-    # Setup the target model mock with a custom class so that
+    # Set up the target model mock with a custom class so that
     # isinstance() checks match the expected type.
     class _TargetModelStub(LlamaForCausalLM):
         model: mock.MagicMock
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index 63178174086611156bc0e21f225aa50716c5dc6b..eacb2ad584baf9f300d6b72302202be1353c44f0 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
                         dtype=torch.bfloat16,
                     )
 
-                    # Setup the block table and KV cache for paged KV.
+                    # Set up the block table and KV cache for paged KV.
                     assert max_sequence_length % block_size == 0
                     max_blocks_per_batch = max_sequence_length // block_size
                     kv_cache = torch.randn(
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
                                 num_alloc_blocks_per_batch] = block_ids.view(
                                     -1, num_alloc_blocks_per_batch)
 
-                    # Setup the slot mapping for the input KVs.
+                    # Set up the slot mapping for the input KVs.
                     tree_positions = sequence_position + torch.arange(
                         0,
                         tree_size_q,
diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/test_kv_sharing.py
index 6b01b7d3e1d6c62b7a7dfc7c6e402237e60b5530..96848047145b639362c37fcc7466dc05971c8f18 100644
--- a/tests/v1/test_kv_sharing.py
+++ b/tests/v1/test_kv_sharing.py
@@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
     }
 
     # Layers 0 and 1 both belong in KV cache group 0
-    # However, if they have have different attention backends, they will be
+    # However, if they have different attention backends, they will be
     # placed in different attention groups for KV cache group 0
     kv_cache_groups = [
         KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 1f16e92f657e0ad8273911fb334c62f635a8f0f6..efa604dd6b5a85953b1db09843c60e352190d512 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -10,7 +10,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 
 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
 ]
 
diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py
index bcc2993028dd6070f2aa2138e39cda0d7b20eeb5..9947fcbe73135168788579e02e3f14350a3eeea6 100644
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@@ -4,18 +4,19 @@
 import openai
 import pytest
 
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_base64
 from vllm.platforms import current_platform
 
-from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS
+from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
 from ...utils import RemoteOpenAIServer
 
 
 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
     return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_asset:
+        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        for image_asset in TEST_IMAGE_ASSETS
     }
 
 
@@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
         client: openai.AsyncOpenAI = remote_server.get_async_client()
 
         # Other requests now should be much faster
-        for image_url in TEST_IMAGE_URLS:
+        for image_url in TEST_IMAGE_ASSETS:
             image_base64 = base64_encoded_image[image_url]
             chat_completion_from_base64 = await client.chat.completions\
                 .create(
diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py
index ca5c067b364e0040f1f317a1a3b819af90295917..05751badc76190a33450c064c91a308bb8398526 100644
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -6,8 +6,12 @@ import pytest
 import torch
 
 from vllm.platforms import current_platform
-from vllm.v1.sample.ops.topk_topp_sampler import (apply_top_k_top_p,
-                                                  apply_top_k_top_p_tpu)
+from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+
+# isort: off
+from vllm.v1.sample.tpu.sampler import (apply_top_k_top_p as
+                                        apply_top_k_top_p_tpu)
+# isort: on
 
 if not current_platform.is_tpu():
     pytest.skip("This test needs a TPU.", allow_module_level=True)
diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
new file mode 100644
index 0000000000000000000000000000000000000000..da8655f95e195b80747f0456448dca15f03f3b0f
--- /dev/null
+++ b/tests/v1/tracing/test_tracing.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+# type: ignore
+from __future__ import annotations
+
+import threading
+from collections.abc import Iterable
+from concurrent import futures
+from typing import Callable, Generator, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
+                    'array_value']
+
+
+def decode_value(value: AnyValue):
+    field_decoders: dict[FieldName, Callable] = {
+        "bool_value": (lambda v: v.bool_value),
+        "string_value": (lambda v: v.string_value),
+        "int_value": (lambda v: v.int_value),
+        "double_value": (lambda v: v.double_value),
+        "array_value":
+        (lambda v: [decode_value(item) for item in v.array_value.values]),
+    }
+    for field, decoder in field_decoders.items():
+        if value.HasField(field):
+            return decoder(value)
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    return {kv.key: decode_value(kv.value) for kv in attributes}
+
+
+class FakeTraceService(TraceServiceServicer):
+
+    def __init__(self):
+        self.request = None
+        self.evt = threading.Event()
+
+    def Export(self, request, context):
+        self.request = request
+        self.evt.set()
+        return ExportTraceServiceResponse()
+
+
+@pytest.fixture
+def trace_service() -> Generator[FakeTraceService, None, None]:
+    """Fixture to set up a fake gRPC trace service"""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
+    service = FakeTraceService()
+    add_TraceServiceServicer_to_server(service, server)
+    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    server.start()
+
+    yield service
+
+    server.stop(None)
+
+
+def test_traces(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    with monkeypatch.context() as m:
+        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+        m.setenv("VLLM_USE_V1", "1")
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(model=model,
+                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+                  gpu_memory_utilization=0.3,
+                  disable_log_stats=False)
+        prompts = ["This is a short prompt"]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+        print(f"test_traces outputs is : {outputs}")
+
+        timeout = 10
+        if not trace_service.evt.wait(timeout):
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        request = trace_service.request
+        assert len(request.resource_spans) == 1, (
+            f"Expected 1 resource span, "
+            f"but got {len(request.resource_spans)}")
+        assert len(request.resource_spans[0].scope_spans) == 1, (
+            f"Expected 1 scope span, "
+            f"but got {len(request.resource_spans[0].scope_spans)}")
+        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+            f"Expected 1 span, "
+            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+
+        attributes = decode_attributes(
+            request.resource_spans[0].scope_spans[0].spans[0].attributes)
+        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                              ) == sampling_params.temperature
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+                              ) == sampling_params.max_tokens
+        assert attributes.get(
+            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+                outputs[0].prompt_token_ids)
+        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+        assert attributes.get(
+            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
+        assert attributes.get(
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
+        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index d6cd03fb01a7393030f69dce051d8217465f2144..6d99029e404efa6f0a2d1a594278b10480088d26 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     KVCacheTensors for the attention and mamba layers
     (via _reshape_kv_cache_tensors function). This test verifies
     that the views are compatible: writing a mamba block
-    will not corrupt an attention block and vice-versa
+    will not corrupt an attention block and vice versa
     '''
 
     current_platform.seed_everything(42)
diff --git a/tools/install_deepgemm.sh b/tools/install_deepgemm.sh
index 33849581d2c0e0d29fbb5f5f4d2a31d261174a51..98427f1835ec2341e0a37c002ebc94ee85dcf7f9 100755
--- a/tools/install_deepgemm.sh
+++ b/tools/install_deepgemm.sh
@@ -6,7 +6,7 @@ set -e
 
 # Default values
 DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
-DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
+DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048"
 
 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
@@ -105,4 +105,4 @@ fi
 
 popd
 
-echo "✅ DeepGEMM installation completed successfully"
\ No newline at end of file
+echo "✅ DeepGEMM installation completed successfully"
diff --git a/tools/mypy.sh b/tools/mypy.sh
index 781d8fc02884b831f0a98861cb6b44e8035470c3..63e3b9a916634dc199a6c974c7c634c084c9409b 100755
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -29,7 +29,7 @@ run_mypy vllm/engine
 run_mypy vllm/executor
 run_mypy vllm/inputs
 run_mypy vllm/lora
-run_mypy vllm/model_executor
+run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor
 run_mypy vllm/plugins
 run_mypy vllm/worker
 run_mypy vllm/v1
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index 038d3c44f043a44355a5c73561e40bd7676c753e..30d6547073d38df6c3d325aed9318d4e0976deae 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
              if not all_the_same(trace_eles)), None)
 
         if first_trace_difference is None:
-            # can't create a unique name, leave them names as the
+            # can't create a unique name, leave the names as they
             # are they will get aggregated by the pivot_table call
             continue
 
diff --git a/use_existing_torch.py b/use_existing_torch.py
index a9f79e16981c4c7bdc489799816bba0cb7630527..b5aafdde16c2846dcdc0d0b5a98a93194014498e 100644
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@@ -1,21 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import glob
-
-requires_files = glob.glob('requirements/*.txt')
-requires_files += ["pyproject.toml"]
-for file in requires_files:
-    print(f">>> cleaning {file}")
-    with open(file) as f:
-        lines = f.readlines()
-    if "torch" in "".join(lines).lower():
-        print("removed:")
-        with open(file, 'w') as f:
-            for line in lines:
-                if 'torch' not in line.lower():
-                    f.write(line)
-                else:
-                    print(line.strip())
-    print(f"<<< done cleaning {file}")
-    print()
+print("vLLM is now using 'uv' to disable build isolation for 'torch'.")
+print("Please instead install vLLM with 'uv pip install -e .' (must use 'uv')")
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 7b90fd3a241bd16fb645a5567fb83fef4f274018..3a5c1b1ce0dafa7f96615601253d3b7e5c2cc506 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -14,6 +14,8 @@ import typing
 import vllm.env_override  # noqa: F401
 
 MODULE_ATTRS = {
+    "bc_linter_skip": "._bc_linter:bc_linter_skip",
+    "bc_linter_include": "._bc_linter:bc_linter_include",
     "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
     "EngineArgs": ".engine.arg_utils:EngineArgs",
     "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
@@ -54,6 +56,8 @@ if typing.TYPE_CHECKING:
                               ScoringRequestOutput)
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
+
+    from ._bc_linter import bc_linter_include, bc_linter_skip
 else:
 
     def __getattr__(name: str) -> typing.Any:
@@ -70,6 +74,8 @@ else:
 
 __all__ = [
     "__version__",
+    "bc_linter_skip",
+    "bc_linter_include",
     "__version_tuple__",
     "LLM",
     "ModelRegistry",
diff --git a/vllm/_bc_linter.py b/vllm/_bc_linter.py
new file mode 100644
index 0000000000000000000000000000000000000000..52a95dbee18663525fc8375ac0bb8889f0e6057d
--- /dev/null
+++ b/vllm/_bc_linter.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# vllm/_bc_linter.py
+from __future__ import annotations
+
+from typing import Any, Callable, TypeVar, overload
+
+T = TypeVar("T")
+
+
+@overload
+def bc_linter_skip(obj: T) -> T:
+    ...
+
+
+@overload
+def bc_linter_skip(*, reason: str | None = ...) -> Callable[[T], T]:
+    ...
+
+
+def bc_linter_skip(obj: Any = None, *, reason: str | None = None):
+    """
+    No-op decorator to mark symbols/files for BC-linter suppression.
+
+    Usage:
+        @bc_linter_skip
+        def legacy_api(...): ...
+    """
+
+    def _wrap(x: T) -> T:
+        return x
+
+    return _wrap if obj is None else obj
+
+
+@overload
+def bc_linter_include(obj: T) -> T:
+    ...
+
+
+@overload
+def bc_linter_include(*, reason: str | None = ...) -> Callable[[T], T]:
+    ...
+
+
+def bc_linter_include(obj: Any = None, *, reason: str | None = None):
+    """
+    Usage:
+        @bc_linter_include
+        def public_api(...): ...
+    """
+
+    def _wrap(x: T) -> T:
+        return x
+
+    return _wrap if obj is None else obj
+
+
+__all__ = ["bc_linter_skip", "bc_linter_include"]
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 356764f3ef657294fcea4443df056b718ad70a02..9d5a445a3eab7edf8a55e0c228c31b460aca8309 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -280,6 +280,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
     torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
+def poly_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
+              bias: torch.Tensor, epsilon: float) -> None:
+    # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input
+    input_contiguous = input.contiguous()
+    torch.ops._C.poly_norm(out, input_contiguous, weight, bias, epsilon)
+
+
 def apply_repetition_penalties_torch(
         logits: torch.Tensor, prompt_mask: torch.Tensor,
         output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None:
@@ -715,6 +722,7 @@ def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
 def cutlass_group_gemm_supported(cuda_device_capability: int) -> bool:
     return torch.ops._C.cutlass_group_gemm_supported(cuda_device_capability)
 
+
 def cutlass_sparse_compress(a: torch.Tensor) \
     -> tuple[torch.Tensor, torch.Tensor]:
     """
@@ -1630,20 +1638,6 @@ def concat_and_cache_mla(
                                                 scale)
 
 
-def cp_fused_concat_and_cache_mla(
-    kv_c: torch.Tensor,
-    k_pe: torch.Tensor,
-    cp_local_token_select_indices: torch.Tensor,
-    kv_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,
-    kv_cache_dtype: str,
-    scale: torch.Tensor,
-) -> None:
-    torch.ops._C_cache_ops.cp_fused_concat_and_cache_mla(
-        kv_c, k_pe, cp_local_token_select_indices, kv_cache, slot_mapping,
-        kv_cache_dtype, scale)
-
-
 def copy_blocks(key_caches: list[torch.Tensor],
                 value_caches: list[torch.Tensor],
                 block_mapping: torch.Tensor) -> None:
@@ -1852,13 +1846,13 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
     return out
 
 
-def sm100_cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor,
-                             q_pe: torch.Tensor,
+def sm100_cutlass_mla_decode(out: torch.Tensor, lse: torch.Tensor,
+                             q_nope: torch.Tensor, q_pe: torch.Tensor,
                              kv_c_and_k_pe_cache: torch.Tensor,
                              seq_lens: torch.Tensor, page_table: torch.Tensor,
                              workspace: torch.Tensor, scale: float,
                              num_kv_splits: int) -> torch.Tensor:
-    torch.ops._C.sm100_cutlass_mla_decode(out, q_nope, q_pe,
+    torch.ops._C.sm100_cutlass_mla_decode(out, lse, q_nope, q_pe,
                                           kv_c_and_k_pe_cache, seq_lens,
                                           page_table, workspace, scale,
                                           num_kv_splits)
@@ -1933,6 +1927,35 @@ class CPUDNNLGEMMHandler:
             torch.ops._C.release_dnnl_matmul_handler(self.handler)
 
 
+if hasattr(torch.ops._C, "create_onednn_mm_handler"):
+    _supports_onednn = True
+else:
+    _supports_onednn = False
+
+
+def create_onednn_mm(
+    weight: torch.Tensor,  # [K, N]
+    primitive_cache_size: int = 128,
+) -> CPUDNNLGEMMHandler:
+    handler = CPUDNNLGEMMHandler()
+    handler.k, handler.n = weight.size()
+    handler.handler = torch.ops._C.create_onednn_mm_handler(
+        weight, primitive_cache_size)
+    return handler
+
+
+def onednn_mm(
+    dnnl_handler: CPUDNNLGEMMHandler,
+    x: torch.Tensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    output = torch.empty((*x.shape[0:-1], dnnl_handler.n), dtype=x.dtype)
+    torch.ops._C.onednn_mm(output, x.reshape(-1, dnnl_handler.k), bias,
+                           dnnl_handler.handler)
+
+    return output
+
+
 def create_onednn_scaled_mm(
     weight: torch.Tensor,  # [K, N]
     weight_scales: torch.Tensor,
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 79e3e448cada30fac50ee423193bf9b1570e08a5..c2868c040aa16974f4824872320ddf0550a22188 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 
 logger = init_logger(__name__)
 
@@ -241,10 +242,9 @@ class ipex_ops:
         k_scale_float: float = 1.0,
         v_scale_float: float = 1.0,
     ) -> None:
-        assert kv_cache_dtype == "auto"
-        # TODO: support FP8 kv cache.
         ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
-            key, value, key_cache, value_cache, slot_mapping)
+            key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype,
+            k_scale_float, v_scale_float)
 
     @staticmethod
     def flash_attn_varlen_func(
@@ -349,3 +349,56 @@ class ipex_ops:
     def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                     block_mapping: torch.Tensor) -> None:
         torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore
+
+    @staticmethod
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
+        scale_ub: Optional[torch.Tensor] = None,
+        use_per_token_if_dynamic: bool = False,
+        output: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize input tensor to FP8 and return quantized tensor and scale.
+        
+        This function is designed for both static and dynamic quantization:
+        If you provide the scale, it will use static scaling and if you omit
+        it, the scale will be determined dynamically. Currently, XPU platform
+        only supports dynamic quantization. The function also allows optional
+        padding of the output tensors for downstream kernels that will benefit
+        from padding.
+
+        Args:
+            input: The input tensor to be quantized to FP8
+            scale: Optional scaling factor for the FP8 quantization
+            scale_ub: Optional upper bound for scaling factor in dynamic
+                per token case
+            num_token_padding: If specified, pad the first dimension
+                of the output to at least this value.
+            use_per_token_if_dynamic: Whether to do per_tensor or per_token
+                in the dynamic quantization case.
+    
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
+                scaling factor.
+        """
+        # This code assumes batch_dim and num_tokens are flattened
+        assert (input.ndim == 2)
+        shape: Union[tuple[int, int], torch.Size] = input.shape
+        out_dtype: torch.dtype = current_platform.fp8_dtype()
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
+        if output is None:
+            output = torch.empty(shape, device=input.device, dtype=out_dtype)
+        else:
+            assert num_token_padding is None, \
+                "padding not supported if output passed in"
+            assert output.dtype == out_dtype
+        assert scale is None, "only dynamic fp8 quantization supported on XPU"
+        assert not use_per_token_if_dynamic, (
+            "per token dynamic fp8 quantization not supported on XPU")
+        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+        torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale)
+
+        return output, scale
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index c8f8d43a983553ca6ea02dae9e90c76e63e3f093..4639a11187d03df3018ad5e991b68b35dc102a11 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Literal
 
 import torch
@@ -11,17 +12,29 @@ from .base import get_vllm_public_assets
 
 VLM_IMAGES_DIR = "vision_model_images"
 
-ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"]
+ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato",
+                         "2560px-Gfp-wisconsin-madison-the-nature-boardwalk",
+                         "Grayscale_8bits_palette_sample_image",
+                         "1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300",
+                         "231-200x300", "27-500x500", "17-150x600",
+                         "handelsblatt-preview", "paper-11"]
 
 
 @dataclass(frozen=True)
 class ImageAsset:
     name: ImageAssetName
 
+    def get_path(self, ext: str) -> Path:
+        """
+        Return s3 path for given image.
+        """
+        return get_vllm_public_assets(filename=f"{self.name}.{ext}",
+                                      s3_prefix=VLM_IMAGES_DIR)
+
     @property
-    def pil_image(self) -> Image.Image:
-        image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
-                                            s3_prefix=VLM_IMAGES_DIR)
+    def pil_image(self, ext="jpg") -> Image.Image:
+
+        image_path = self.get_path(ext)
         return Image.open(image_path)
 
     @property
@@ -29,6 +42,9 @@ class ImageAsset:
         """
         Image embeddings, only used for testing purposes with llava 1.5.
         """
-        image_path = get_vllm_public_assets(filename=f"{self.name}.pt",
-                                            s3_prefix=VLM_IMAGES_DIR)
+        image_path = self.get_path('pt')
         return torch.load(image_path, map_location="cpu", weights_only=True)
+
+    def read_bytes(self, ext: str) -> bytes:
+        p = Path(self.get_path(ext))
+        return p.read_bytes()
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 8ab0e9760be87e72fedfd6aedab452ce862b699e..983e9114cccfbba87b6fcea2d51282d6505ac8d8 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -110,22 +110,23 @@ class VideoAsset:
     def filename(self) -> str:
         return self._NAME_TO_FILE[self.name]
 
+    @property
+    def video_path(self) -> str:
+        return download_video_asset(self.filename)
+
     @property
     def pil_images(self) -> list[Image.Image]:
-        video_path = download_video_asset(self.filename)
-        ret = video_to_pil_images_list(video_path, self.num_frames)
+        ret = video_to_pil_images_list(self.video_path, self.num_frames)
         return ret
 
     @property
     def np_ndarrays(self) -> npt.NDArray:
-        video_path = download_video_asset(self.filename)
-        ret = video_to_ndarrays(video_path, self.num_frames)
+        ret = video_to_ndarrays(self.video_path, self.num_frames)
         return ret
 
     @property
     def metadata(self) -> dict[str, Any]:
-        video_path = download_video_asset(self.filename)
-        ret = video_get_metadata(video_path)
+        ret = video_get_metadata(self.video_path)
         return ret
 
     def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
@@ -134,5 +135,4 @@ class VideoAsset:
         
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        video_path = download_video_asset(self.filename)
-        return librosa.load(video_path, sr=sampling_rate)[0]
+        return librosa.load(self.video_path, sr=sampling_rate)[0]
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 0b9c625533cb71b2c0a17d1da8ccb89ec68ab66d..0217bff6adafa4f32666ba11af1e707a01b3fd80 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -257,6 +257,32 @@ class AttentionLayer(Protocol):
 
 class AttentionImpl(ABC, Generic[T]):
 
+    # Whether the attention impl can return the softmax lse for decode.
+    # Some features like decode context parallelism require the softmax lse.
+    can_return_lse_for_decode: bool = False
+
+    # some attention backends might not always want to return lse
+    # even if they can return lse (for efficiency reasons)
+    need_to_return_lse_for_decode: bool = False
+
+    dcp_world_size: int
+    dcp_rank: int
+
+    def __new__(cls, *args, **kwargs):
+        # use __new__ so that all subclasses will call this
+        self = super().__new__(cls)
+        try:
+            from vllm.distributed.parallel_state import get_dcp_group
+            self.dcp_world_size = get_dcp_group().world_size
+            self.dcp_rank = get_dcp_group().rank_in_group
+        except AssertionError:
+            # DCP might not be initialized in testing
+            self.dcp_world_size = 1
+            self.dcp_rank = 0
+        self.need_to_return_lse_for_decode = self.dcp_world_size > 1 \
+            and self.can_return_lse_for_decode
+        return self
+
     @abstractmethod
     def __init__(
         self,
diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py
index caa02530d2fd6c12946644ce38e69982f4dd20e3..a7d0e3afb517fd84e7add62575e3a5b519746a1f 100644
--- a/vllm/attention/backends/differential_flash_attn.py
+++ b/vllm/attention/backends/differential_flash_attn.py
@@ -734,6 +734,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
                     window_size=self.sliding_window,
                     alibi_slopes=self.alibi_slopes,
                     softcap=self.logits_soft_cap,
+                    fa_version=self.vllm_flash_attn_version,
                 )
                 assert prefill_output.shape == output[:
                                                       num_prefill_tokens].shape
@@ -755,6 +756,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
                     window_size=self.sliding_window,
                     alibi_slopes=self.alibi_slopes,
                     softcap=self.logits_soft_cap,
+                    fa_version=self.vllm_flash_attn_version,
                 ).squeeze(1)
             except Exception as e:
                 logger.error("Error in PagedAttention.forward_decode: %s",
@@ -787,6 +789,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
             window_size=self.sliding_window,
             alibi_slopes=self.alibi_slopes,
             softcap=self.logits_soft_cap,
+            fa_version=self.vllm_flash_attn_version,
         ).squeeze(1)
         return output
 
diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py
index f23c096952ce0a1d708200de1df43bca99eb8e3a..411eb5413f53c14c161981be3014f55d15d9b8ad 100644
--- a/vllm/attention/backends/flashmla.py
+++ b/vllm/attention/backends/flashmla.py
@@ -17,6 +17,7 @@ from vllm.attention.backends.mla.common import (MLACommonBackend,
 from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
                                          get_mla_metadata,
                                          is_flashmla_supported)
+from vllm.platforms.cuda import CudaPlatform
 
 
 class FlashMLABackend(MLACommonBackend):
@@ -181,6 +182,16 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
 
+        # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs
+        # context:
+        # https://github.com/deepseek-ai/FlashMLA/issues/83
+        # https://github.com/vllm-project/vllm/issues/24513
+        if CudaPlatform.has_device_capability(100):
+            raise NotImplementedError(
+                "FlashMLA is temporarily disabled on Blackwell (SM 10.0). "
+                "Please use CUTLASS_MLA or TRITON_MLA instead. "
+                "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`")
+
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index 35a140c2db5932896361c69eaed7f6068145513f..af21a04c1f212b551b2b2ded7016162ad9c0d3c7 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -824,7 +824,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
             and context_lens_tensor is not None \
             and context_lens_tensor[:self.num_prefills].max() > 0:
 
-            # NOTE: it is recommend you read the `Chunked Prefill` section in
+            # NOTE: it is recommended you read the `Chunked Prefill` section in
             # the comment at the top of the file before trying to understand
             # the following code
 
@@ -1056,7 +1056,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
             return layer.weight
 
         # we currently do not have quantized bmm's which are needed for
-        # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
         # the bmm's in 16-bit, the extra memory overhead of this is fairly low
         if self.use_llama_nn and self.kv_b_proj.quant_method is None:
             kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj)
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 237802afccde919d4c0546d2201ac3bb7c47745e..bb05b468fd1028e0327ccd5ab23cbefeddf9b4b5 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -360,13 +360,13 @@ class MultiHeadAttention(nn.Module):
             # currently, only torch_sdpa is supported on rocm
             self.attn_backend = _Backend.TORCH_SDPA
         else:
-            if backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1,
-                           _Backend.FLEX_ATTENTION):
-                backend = _Backend.XFORMERS
-
             self.attn_backend = backend if backend in {
-                _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
-            } else _Backend.TORCH_SDPA
+                _Backend.TORCH_SDPA,
+                _Backend.TORCH_SDPA_VLLM_V1,
+                _Backend.XFORMERS,
+                _Backend.PALLAS_VLLM_V1,
+                _Backend.ROCM_AITER_FA,
+            } else current_platform.get_vit_attn_backend()
 
         if (self.attn_backend == _Backend.XFORMERS
                 and not check_xformers_availability()):
@@ -399,7 +399,8 @@ class MultiHeadAttention(nn.Module):
                                                           key,
                                                           value,
                                                           scale=self.scale)
-        elif self.attn_backend == _Backend.TORCH_SDPA:
+        elif (self.attn_backend == _Backend.TORCH_SDPA
+              or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1):
             query, key, value = (x.transpose(1, 2)
                                  for x in (query, key, value))
             out = F.scaled_dot_product_attention(query,
@@ -413,6 +414,19 @@ class MultiHeadAttention(nn.Module):
             from torch_xla.experimental.custom_kernel import flash_attention
             out = flash_attention(query, key, value, sm_scale=self.scale)
             out = out.transpose(1, 2)
+        elif self.attn_backend == _Backend.ROCM_AITER_FA:
+            from aiter import flash_attn_varlen_func
+
+            # ROCm Flash Attention expects (batch, seq, heads, head_dim)
+            out = flash_attn_varlen_func(query,
+                                         key,
+                                         value,
+                                         softmax_scale=self.scale)
+        else:
+            # ViT attention hasn't supported this backend yet
+            raise NotImplementedError(
+                f"ViT attention hasn't supported {self.attn_backend} "
+                f"backend yet.")
 
         return out.reshape(bsz, q_len, -1)
 
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24fa4e15f67963a1b29669bc5b4376acef6e2b9
--- /dev/null
+++ b/vllm/attention/layers/cross_attention.py
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+from copy import copy
+from typing import Optional
+
+import numpy as np
+import torch
+
+from vllm import envs
+from vllm.attention.backends.abstract import (AttentionBackend,
+                                              AttentionMetadata, AttentionType)
+from vllm.attention.layer import Attention
+from vllm.attention.selector import get_attn_backend
+from vllm.config import CacheConfig, VllmConfig
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.utils import cdiv
+from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
+                                              subclass_attention_backend)
+from vllm.v1.kv_cache_interface import CrossAttentionSpec
+
+logger = init_logger(__name__)
+
+
+def _get_max_encoder_len(vllm_config: VllmConfig) -> int:
+    return MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
+        vllm_config.model_config)
+
+
+def _get_cross_slot_mapping(encoder_seq_lens: np.ndarray,
+                            block_table_tensor: torch.Tensor,
+                            kv_cache_spec: CrossAttentionSpec,
+                            device: torch.device) -> torch.Tensor:
+    """Get cross-attention slot mappings."""
+
+    block_size = kv_cache_spec.block_size
+    slot_mappings = []
+
+    # Find indices with non-zero encoder sequence lengths
+    # The majority of parallel requests will be running the
+    # decoder, so this list should be relatively small.
+    active_indices = np.nonzero(encoder_seq_lens)[0]
+
+    for req_index in active_indices:
+        encoder_seq_len = encoder_seq_lens[req_index].item()
+
+        # Calculate the number of blocks needed for this request
+        num_blocks_needed = cdiv(encoder_seq_len, block_size)
+
+        # Get the block IDs for this request from the tensor
+        req_block_ids = block_table_tensor[req_index]
+
+        # Get only the blocks we need (first num_blocks_needed blocks)
+        needed_block_ids = req_block_ids[:num_blocks_needed]
+
+        # All needed blocks are allocated
+        i_values = torch.arange(encoder_seq_len,
+                                dtype=torch.int64,
+                                device=device)
+        block_indices = i_values // block_size
+        block_offsets = i_values % block_size
+        block_numbers = needed_block_ids[block_indices]
+        slot_mapping = block_numbers * block_size + block_offsets
+
+        slot_mappings.append(slot_mapping)
+
+    if slot_mappings:
+        return torch.cat(slot_mappings)
+    else:
+        return torch.empty(0, dtype=torch.int64, device=device)
+
+
+@functools.lru_cache
+def create_cross_attention_backend(
+    underlying_attn_backend: AttentionBackend, ) -> type[AttentionBackend]:
+    prefix = "CrossAttention_"
+    underlying_builder = underlying_attn_backend.get_builder_cls()
+
+    class CrossAttentionBuilder(underlying_builder):  # type: ignore
+
+        def build(self,
+                  common_prefix_len: int,
+                  common_attn_metadata: CommonAttentionMetadata,
+                  fast_build: bool = False) -> AttentionMetadata:
+            new_metadata = copy(common_attn_metadata)
+            new_metadata.causal = False
+            max_encoder_len = _get_max_encoder_len(self.vllm_config)
+            new_metadata.max_seq_len = max_encoder_len
+
+            new_metadata.seq_lens = torch.full(
+                (new_metadata.num_reqs, ),
+                max_encoder_len,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            new_metadata.seq_lens_cpu = torch.full(
+                (new_metadata.num_reqs, ),
+                max_encoder_len,
+                dtype=torch.int32,
+                device="cpu",
+            )
+            new_metadata.slot_mapping = _get_cross_slot_mapping(
+                new_metadata.encoder_seq_lens, new_metadata.block_table_tensor,
+                self.kv_cache_spec, self.device)
+            return super().build(common_prefix_len, new_metadata, fast_build)
+
+    attn_backend = subclass_attention_backend(
+        name_prefix=prefix,
+        attention_backend_cls=underlying_attn_backend,
+        builder_cls=CrossAttentionBuilder)
+
+    return attn_backend
+
+
+class CrossAttention(Attention):
+    """
+    Cross-attention for encoder-decoder models.
+    Handles attention between decoder queries and encoder keys/values.
+    """
+
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 cache_config: Optional[CacheConfig] = None,
+                 attn_type: Optional[str] = None,
+                 **kwargs):
+        dtype = torch.get_default_dtype()
+
+        if cache_config is not None:
+            kv_cache_dtype = cache_config.cache_dtype
+            block_size = cache_config.block_size
+        else:
+            kv_cache_dtype = "auto"
+            block_size = 16
+
+        if envs.VLLM_USE_V1:
+            underlying_attn_backend = get_attn_backend(head_size, dtype,
+                                                       kv_cache_dtype,
+                                                       block_size)
+
+            attn_backend = create_cross_attention_backend(
+                underlying_attn_backend)
+        else:
+            # in v0 cross attention is handled inside the backends
+            attn_backend = None
+
+        if attn_type is not None:
+            assert attn_type == AttentionType.ENCODER_DECODER, (
+                "CrossAttention only supports AttentionType.ENCODER_DECODER")
+
+        super().__init__(num_heads=num_heads,
+                         head_size=head_size,
+                         scale=scale,
+                         cache_config=cache_config,
+                         attn_backend=attn_backend,
+                         attn_type=AttentionType.ENCODER_DECODER,
+                         **kwargs)
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index e5b90a8b2755813db9dfcab0ce05560ae483fe0f..bf4b06512a3c11e698fe69c4f039ee25f4e707bc 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -15,6 +15,8 @@ from vllm.triton_utils import tl, triton
 
 from .prefix_prefill import context_attention_fwd
 
+float8_info = torch.finfo(current_platform.fp8_dtype())
+
 
 @triton.jit
 def cdiv_fn(x, y):
@@ -34,6 +36,7 @@ def kernel_paged_attention_2d(
         scale,  # float32
         k_scale,  # float32
         v_scale,  # float32
+        out_scale_inv,
         num_query_heads: tl.constexpr,  # int
         num_queries_per_kv: tl.constexpr,  # int
         num_queries_per_kv_padded: tl.constexpr,  # int
@@ -60,7 +63,9 @@ def kernel_paged_attention_2d(
         filter_by_query_len: tl.constexpr,  # bool
         query_start_len_ptr,  # [num_seqs+1]
         USE_SINKS: tl.constexpr,  # bool
-):
+        USE_FP8: tl.constexpr,
+        FP8_MIN: tl.constexpr = float8_info.min,
+        FP8_MAX: tl.constexpr = float8_info.max):
     seq_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
 
@@ -204,6 +209,9 @@ def kernel_paged_attention_2d(
 
     # epilogue
     acc = acc / L[:, None]
+    if USE_FP8:
+        acc = acc * tl.load(out_scale_inv)
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
 
     output_offset = (cur_batch_in_all_start_index * output_stride_0 +
                      query_head_idx * output_stride_1)
@@ -234,6 +242,7 @@ def chunked_prefill_paged_decode(
     alibi_slopes=None,
     sliding_window=None,
     sm_scale=None,
+    output_scale=None,
     # Optional tensor for sinks
     sinks=None,
 ):
@@ -266,6 +275,7 @@ def chunked_prefill_paged_decode(
             sliding_window=sliding_window,
             sm_scale=sm_scale,
             skip_decode=True,
+            fp8_out_scale=output_scale,
             sinks=sinks,
         )
 
@@ -316,7 +326,7 @@ def chunked_prefill_paged_decode(
         tmp_output = torch.empty(
             size=(total_num_seq, num_query_heads, max_num_partitions,
                   head_size),
-            dtype=output.dtype,
+            dtype=query.dtype,
             device=output.device,
         )
         exp_sums = torch.empty(
@@ -345,6 +355,7 @@ def chunked_prefill_paged_decode(
             kv_cache_dtype=kv_cache_dtype,
             k_scale=k_scale,
             v_scale=v_scale,
+            fp8_out_scale=output_scale,
         )
     else:
         kernel_paged_attention_2d[(
@@ -362,6 +373,8 @@ def chunked_prefill_paged_decode(
             scale=sm_scale,
             k_scale=k_scale,
             v_scale=v_scale,
+            out_scale_inv=1.0 /
+            output_scale if output_scale is not None else 1.0,
             num_query_heads=num_query_heads,
             num_queries_per_kv=num_queries_per_kv,
             num_queries_per_kv_padded=num_queries_per_kv_padded,
@@ -388,4 +401,5 @@ def chunked_prefill_paged_decode(
             filter_by_query_len=True,
             query_start_len_ptr=query_start_loc,
             USE_SINKS=sinks is not None,
+            USE_FP8=output_scale is not None,
         )
diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..189b57e8e8b82f60d2e8b7e8179610dd86bf875d
--- /dev/null
+++ b/vllm/attention/ops/common.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.distributed.parallel_state import GroupCoordinator
+from vllm.triton_utils import tl, triton
+
+
+@triton.jit
+def _correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr,
+                                vlse_ptr, outputs_stride_B, outputs_stride_H,
+                                outputs_stride_D, lses_stride_N, lses_stride_B,
+                                lses_stride_H, lse_idx, HEAD_DIM: tl.constexpr,
+                                N_ROUNDED: tl.constexpr):
+    """
+    Apply the all-gathered lses to correct each local rank's attention
+    output. we still need perform a cross-rank reduction to obtain the
+    final attention output.
+
+    Args:
+        output: [ B, H, D ]
+        lses   : [ N, B, H ]
+        cp, batch, q_heads, v_head_dim
+    Return:
+        output: [ B, H, D ]
+        lse   : [ B, H ]
+    """
+    batch_idx = tl.program_id(axis=0).to(tl.int64)
+    head_idx = tl.program_id(axis=1).to(tl.int64)
+    d_offsets = tl.arange(0, HEAD_DIM)
+    num_n_offsets = tl.arange(0, N_ROUNDED)
+
+    # shape = [N]
+    lse_offsets = num_n_offsets * lses_stride_N + batch_idx * \
+        lses_stride_B + head_idx * lses_stride_H
+
+    # calc final lse
+    lse = tl.load(lses_ptr + lse_offsets)
+    lse = tl.where((lse != lse) | (lse == float('inf')), -float('inf'), lse)
+    lse_max = tl.max(lse, axis=0)
+    lse -= lse_max
+    lse_exp = tl.exp(lse)
+    lse_acc = tl.sum(lse_exp, axis=0)
+    lse = tl.log(lse_acc)
+    lse += lse_max
+
+    lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H
+    tl.store(vlse_ptr + lse_offsets, lse)
+
+    # shape = [D]
+    output_offsets = batch_idx * outputs_stride_B + \
+                    head_idx * outputs_stride_H + \
+                    d_offsets * outputs_stride_D
+
+    # correct output
+    lse_offset = lse_idx * lses_stride_N + batch_idx * \
+        lses_stride_B + head_idx * lses_stride_H
+    lse_tmp = tl.load(lses_ptr + lse_offset)
+    lse_finally = lse_tmp - lse
+    lse_finally = tl.where(
+        (lse_finally != lse_finally) | (lse_finally == float('inf')),
+        -float('inf'), lse_finally)
+    factor = tl.exp(lse_finally)
+    output = tl.load(outputs_ptr + output_offsets)
+    output = output * factor
+
+    tl.store(new_output_ptr + output_offsets, output)
+
+
+class CPTritonContext:
+    """ The CPTritonContext is used to avoid recompilation of the Triton JIT.
+    """
+
+    def __init__(self):
+        self.inner_kernel = None
+
+    def call_kernel(self, kernel, grid, *regular_args, **const_args):
+        if self.inner_kernel is None:
+            self.inner_kernel = kernel[grid](*regular_args, **const_args)
+        else:
+            self.inner_kernel[grid](*regular_args)
+
+
+def correct_attn_out(out: torch.Tensor, lses: torch.Tensor, cp_rank: int,
+                     ctx: CPTritonContext):
+    """
+    Apply the all-gathered lses to correct each local rank's attention
+    output. we still need perform a cross-rank reduction to obtain the
+    final attention output.
+
+    Args:
+        output: [ B, H, D ]
+        lses   : [ N, B, H ]
+    Return:
+        output: [ B, H, D ]
+        lse   : [ B, H ]
+    """
+    if ctx is None:
+        ctx = CPTritonContext()
+
+    lse = torch.empty_like(lses[0])
+
+    grid = (out.shape[0], out.shape[1], 1)
+    regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(),
+                    cp_rank)
+    const_args = {
+        "HEAD_DIM": out.shape[-1],
+        "N_ROUNDED": lses.shape[0],
+    }
+
+    ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args,
+                    **const_args)
+    return out, lse
+
+
+def cp_lse_ag_out_rs(cp_attn_out: torch.Tensor,
+                     cp_attn_lse: torch.Tensor,
+                     cp_group: GroupCoordinator,
+                     ctx: CPTritonContext = None):
+    """
+    cp_attn_out: [ B, H, D ]
+    cp_attn_lse: [ B, H ]
+    """
+    if cp_group.world_size == 1:
+        return cp_attn_out
+
+    if ctx is None:
+        ctx = CPTritonContext()
+
+    lses = torch.empty((cp_group.world_size, ) + cp_attn_lse.shape,
+                       dtype=cp_attn_lse.dtype,
+                       device=cp_attn_lse.device)
+
+    cp_attn_lse = cp_attn_lse.contiguous()
+    lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
+    out, _ = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
+    assert out.is_contiguous()
+    out = cp_group.reduce_scatter(out, dim=1)
+    return out
diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py
index 564042cf8eb12b6bbbed621518fa58514f57daae..2c3e8c42400cec46b4b5451d67d66e639e8866b4 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -105,7 +105,9 @@ def flash_mla_with_kvcache(
         descale_q,
         descale_k,
     )
-    return out, softmax_lse
+
+    # Note(hc): need revisit when we support DCP with decode query_len > 1.
+    return out.squeeze(1), softmax_lse.squeeze(-1)
 
 
 #
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
deleted file mode 100644
index 29fa4320176168fd813b70765c318e3bdff7ab1a..0000000000000000000000000000000000000000
--- a/vllm/attention/ops/nki_flash_attn.py
+++ /dev/null
@@ -1,903 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import neuronxcc.nki.isa as nisa
-import neuronxcc.nki.language as nl
-import numpy as np
-import torch
-from neuronxcc import nki
-from neuronxcc.nki.language import par_dim
-
-from vllm.utils import cdiv
-
-
-def is_power_of_2(x):
-    return x > 0 and (x & (x - 1)) == 0
-
-
-@nki.jit
-def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile):
-    """
-    Load block tables from HBM into SRAM
-
-    `block_tables_hbm` has shape `(num_tiles * num_blocks_per_tile, )`.
-    In case `num_tiles > B_P_SIZE`, we need further tile `num_tile` dimension.
-    """
-    B_P_SIZE = 128
-
-    # reshape as `(num_tiles, num_blocks_per_tile)`
-    assert len(block_tables_hbm.shape) == 1
-    (num_total_blocks, ) = block_tables_hbm.shape
-    assert num_blocks_per_tile * num_tiles == num_total_blocks
-    block_tables_hbm = block_tables_hbm.reshape(
-        (num_tiles, num_blocks_per_tile))
-
-    block_tables_sbuf = nl.zeros(
-        (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile),
-        dtype=nl.int32,
-    )
-    for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)):
-        i_p = nl.arange(B_P_SIZE)[:, None]
-        i_f = nl.arange(num_blocks_per_tile)[None, :]
-        block_tables_sbuf[i, i_p, i_f] = nl.load(
-            block_tables_hbm[i_p + i * B_P_SIZE, i_f],
-            dtype=nl.int32,
-            mask=(i_p + i * B_P_SIZE < num_tiles),
-        )
-    return block_tables_sbuf
-
-
-@nki.jit
-def transform_block_tables_for_indirect_load(
-    block_tables,
-    block_size_tiling_factor,
-    num_head,
-    head_id,
-):
-    """
-    This function does two things:
-    1. calculate new `block_tables` for a `head_id` after flattening
-    `num_block`, `num_head`, and `block_size_tiling_factor` dimensions
-    2. transpose the result so that `block_table` for each tile is mapped to
-    SBUF Partition dimension for vectorized DMA
-
-    Tiling trick to further improve DMA performance:
-    Given KV cache shape `(num_block, num_head, block_size, D)`, when loading M
-    blocks of a given `head_id` from HBM, the load `cache[block_tables,
-    head_id]` has shape `(M, block_size, D)`. If M < B_P_SIZE = 128, DMA may not
-    fully utilize hardware parallelization. The solution is to tile `block_size`
-    into `(block_size_tiling_factor, tiled_block_size)` s.t. `M *
-    block_size_tiling_factor = B_P_SIZE`. After tiling, KV cache has shape
-    `(num_block, num_head, block_size_tiling_factor, tiled_block_size, D)`. 
-
-    Note:
-    We don't further tile D dimension as small DMA size also hurts performance.
-    """
-    B_P_SIZE = 128
-    num_partitions, num_tiles_per_partition, num_blocks_per_tile = (
-        block_tables.shape)
-    assert num_tiles_per_partition == B_P_SIZE
-    assert is_power_of_2(
-        num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2"
-
-    num_loads = cdiv(num_blocks_per_tile, B_P_SIZE)
-    block_tables_transposed = nl.ndarray(
-        (
-            num_loads,
-            par_dim(B_P_SIZE),
-            num_partitions * num_tiles_per_partition,
-        ),
-        dtype=nl.int32,
-    )
-
-    # prepare iota ahead of time to avoid repeatedly using Gpsimd
-    if num_head > 1:
-        head_id = nisa.iota(head_id, dtype=nl.int32).reshape((1, 1))
-        head_id = nl.transpose(
-            head_id.broadcast_to((1, num_tiles_per_partition)))
-        if num_blocks_per_tile > 1:
-            head_id = head_id.broadcast_to(
-                (num_tiles_per_partition, num_blocks_per_tile))
-
-    if block_size_tiling_factor > 1:
-        broadcast_shape = (
-            num_tiles_per_partition,
-            num_blocks_per_tile,
-            block_size_tiling_factor,
-        )
-        offset = nisa.iota(nl.arange(block_size_tiling_factor)[None, None, :],
-                           dtype=nl.int32).broadcast_to(broadcast_shape)
-
-    for partition_id in nl.affine_range(num_partitions):
-        block_tables_partition = block_tables[partition_id]
-        if num_head > 1:
-            # fuse num_block and num_head dimension
-            block_tables_partition = block_tables_partition * num_head + head_id
-
-        if block_size_tiling_factor > 1:
-            # need to apply block size tiling trick
-            assert num_blocks_per_tile * block_size_tiling_factor == B_P_SIZE
-            block_tables_partition = ((block_tables_partition *
-                                       block_size_tiling_factor).reshape(
-                                           (num_tiles_per_partition,
-                                            num_blocks_per_tile,
-                                            1)).broadcast_to(broadcast_shape))
-            new_block_tables = block_tables_partition + offset
-            new_block_tables = new_block_tables.reshape(
-                (num_tiles_per_partition, B_P_SIZE))
-        else:
-            new_block_tables = block_tables_partition
-
-        # transpose the block table so that it can be used by vector DGE
-        for i in nl.affine_range(num_loads):
-            i_p = nl.arange(B_P_SIZE)[:, None]
-            i_f = (partition_id * num_tiles_per_partition +
-                   nl.arange(num_tiles_per_partition)[None, :])
-            block_tables_transposed[i, i_p, i_f] = nl.transpose(
-                new_block_tables[:, nl.ds(i * B_P_SIZE, B_P_SIZE)])
-    return block_tables_transposed
-
-
-@nki.jit
-def load_kv_tile_from_cache(
-    cur_k_tile,
-    cur_v_tile,
-    kv_cache,
-    block_tables,
-    large_k_tile_idx,
-    num_blocks_per_large_tile,
-    tiled_block_size,
-    B_P_SIZE,
-    B_D_SIZE,
-):
-    """
-    Load KV cache and transform Key and Value into layout required by Matmul
-
-    Vectorized DMA Load layout:
-    Key and Value: (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE)
-
-    Layout used by attention matmuls:
-    Key: (par_dim(B_D_SIZE), seqlen_kv)
-    Value: (seqlen_kv // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE)
-           equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE)
-    """
-    # load key cache
-    num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE)
-    for load_idx in nl.affine_range(num_loads):
-        i_p = nl.arange(B_P_SIZE)[:, None]
-        i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :]
-        loaded = nl.load(kv_cache[0, block_tables[load_idx, i_p,
-                                                  large_k_tile_idx], i_f])
-        if cur_k_tile.dtype != loaded.dtype:
-            loaded = nl.copy(loaded, dtype=cur_k_tile.dtype)
-        # Transpose SBUF tensor using PE
-        for tb_i in nl.affine_range(tiled_block_size):
-            cur_k_tile[
-                :,
-                nl.ds(
-                    load_idx * B_P_SIZE * tiled_block_size + tb_i * B_P_SIZE,
-                    B_P_SIZE,
-                ),
-            ] = nl.transpose(loaded[:, nl.ds(tb_i * B_D_SIZE, B_D_SIZE)])
-
-    # load value cache
-    for load_idx in nl.affine_range(num_loads):
-        loaded = nl.load(kv_cache[1, block_tables[load_idx, i_p,
-                                                  large_k_tile_idx], i_f])
-        if cur_v_tile.dtype != loaded.dtype:
-            loaded = nl.copy(loaded, dtype=cur_v_tile.dtype)
-        i_p = nl.arange(B_P_SIZE)[:, None]
-        i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :]
-        cur_v_tile[
-            :,
-            nl.ds(
-                load_idx * tiled_block_size * B_D_SIZE,
-                tiled_block_size * B_D_SIZE,
-            ),
-        ] = loaded
-
-
-@nki.jit
-def transpose_p_local(p_local_transposed,
-                      p_local,
-                      LARGE_TILE_SZ,
-                      B_F_SIZE=512):
-    for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
-        if nisa.get_nc_version() == nisa.nc_version.gen3:
-            p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE),
-                                       buffer=nl.sbuf,
-                                       dtype=p_local.dtype)
-        else:
-            p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE),
-                                       buffer=nl.psum,
-                                       dtype=np.float32)
-
-        for j in nl.affine_range(B_F_SIZE // 128):
-            j_128_slice = nl.ds(j * 128, 128)
-            i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128)
-
-            if nisa.get_nc_version() == nisa.nc_version.gen3:
-                p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose(
-                    p_local[:, i_j_128_slice])
-            else:
-                p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose(
-                    p_local[:, i_j_128_slice])
-
-        p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy(
-            p_local_t_tmp, dtype=p_local_transposed.dtype)
-
-
-@nki.jit
-def _flash_attention_core(
-    q_local_tile,
-    k,
-    v,
-    o_buffer,
-    l_buffer,
-    m_buffer,
-    kernel_dtype,
-    acc_type,
-    tile_mask,
-    use_causal_mask,
-    q_tile_idx=None,
-    initialize=False,
-    LARGE_TILE_SZ=2048,
-    B_P_SIZE=128,
-    B_F_SIZE=512,
-    B_D_SIZE=128,
-    qk_res_buffer=None,
-):
-    """
-    The flash attention core function to calculate self attention between a tile
-    of q and a block of K and V.
-    The q_local_tile has (B_P_SIZE, B_D_SIZE)
-    The K and V have shape (B_D_SIZE, LARGE_TILE_SZ), whose free dimension will
-    be split into size B_F_SIZE tiles
-
-    The results are stored in the following three buffers
-    o_buffer: (B_P_SIZE, d)
-    l_buffer: (B_P_SIZE, 1)
-    m_buffer: (B_P_SIZE, 1)
-
-    All IO buffers are in SBUF.
-    """
-    num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE
-
-    qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
-                            buffer=nl.sbuf,
-                            dtype=acc_type)
-    max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile),
-                           dtype=acc_type)
-    for k_i in nl.affine_range(num_k_tile_per_large_tile):
-        k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE)
-
-        if use_causal_mask:
-            # mask are used to only apply computation to the lower half of the
-            # matrix, which reduce the arithmetic intensity by up to 50%
-            multiplication_required_selection = (q_tile_idx * B_P_SIZE
-                                                 >= k_i * B_F_SIZE)
-        else:
-            multiplication_required_selection = True
-
-        if multiplication_required_selection:
-            qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE),
-                                 dtype=np.float32,
-                                 buffer=nl.psum)  # (128, 512)
-            qk_psum[:, :] = nl.matmul(q_local_tile,
-                                      k[:, k_i_b_f_slice],
-                                      transpose_x=True)  # (p(128), 512)
-            qk_res_buf[:, k_i_b_f_slice] = nl.where(
-                tile_mask[:, k_i_b_f_slice],
-                qk_psum[:, nl.ds(0, B_F_SIZE)],
-                -9984.0,
-                dtype=acc_type,
-            )
-        else:
-            qk_res_buf[:, k_i_b_f_slice] = -9984.0
-
-        # Calculate max of the current tile
-        max_local[:, k_i] = nisa.tensor_reduce(
-            np.max,
-            qk_res_buf[:, k_i_b_f_slice],
-            axis=(1, ),
-            dtype=acc_type,
-            negate=False,
-        )
-
-    if qk_res_buffer is not None:
-        qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :])
-
-    max_ = nisa.tensor_reduce(
-        np.max,
-        max_local[:, :],
-        axis=(1, ),
-        dtype=acc_type,
-        negate=False,
-    )
-
-    o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE),
-                                   dtype=o_buffer.dtype)
-
-    if initialize:
-        m_buffer[:, 0] = nl.copy(max_)
-        m_current = max_
-    else:
-        m_previous = nl.copy(m_buffer[:, 0])
-        m_buffer[:, 0] = nl.maximum(m_previous, max_)  # (128,1)
-
-        m_current = m_buffer[:, 0]
-        # Compute scaling factor
-        alpha = nisa.activation(
-            np.exp,
-            m_previous,
-            bias=-1 * m_current,
-            scale=1.0,
-        )
-        o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha)
-
-    p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
-                         dtype=kernel_dtype)
-    REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2)
-
-    p_partial_sum = nl.ndarray(
-        (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE),
-        dtype=acc_type,
-    )
-
-    for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE):
-        k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE)
-
-        # compute exp(qk - max)
-        # Compute partial row - tile sum of exp(qk - max))
-        # FIXME : Use activation accumulate to accumulate over k_r_i loop ?
-        p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce(
-            np.exp,
-            qk_res_buf[:, k_r_i_reduce_slice],
-            bias=-1 * m_current,
-            scale=1.0,
-            reduce_op=nl.add,
-            reduce_res=p_partial_sum[:, k_r_i],
-            dtype=kernel_dtype,
-        )
-
-    ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type)
-
-    p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
-                                    dtype=kernel_dtype)
-    transpose_p_local(
-        p_local_transposed=p_local_transposed,
-        p_local=p_local,
-        LARGE_TILE_SZ=LARGE_TILE_SZ,
-        B_F_SIZE=B_F_SIZE,
-    )
-
-    pv_psum = nl.zeros(
-        (par_dim(B_P_SIZE), B_D_SIZE),
-        dtype=np.float32,
-        buffer=nl.psum,
-    )
-    for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE):
-        pv_psum[:, :] += nl.matmul(
-            p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)],
-            v[:, nl.ds(k_i * B_D_SIZE, B_D_SIZE)],
-            transpose_x=True,
-        )  # (128, 128) (p(Br), d)
-
-    if initialize:
-        o_buffer[:, :] = nl.copy(pv_psum[:, :])
-        l_buffer[:, 0] = nl.add(nl.log(ps), max_)
-    else:
-        o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum)
-
-        l_prev = l_buffer[:, 0]
-        l_exp = nl.add(
-            nl.exp(nl.subtract(l_prev, m_current)),
-            ps,
-        )
-        l_buffer[:, 0] = nl.add(m_current, nl.log(l_exp))
-
-
-@nki.jit
-def load_v_tile(v_hbm_tile, cur_v_tile, large_tile_idx, v_i, LARGE_TILE_SZ):
-    B_P_SIZE = 128
-    B_D_SIZE = v_hbm_tile.shape[-1]
-    loaded = nl.load(v_hbm_tile[
-        nl.ds(large_tile_idx * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE),
-        :,
-    ])
-    if cur_v_tile.dtype != loaded.dtype:
-        loaded = nl.copy(loaded, dtype=cur_v_tile.dtype)
-    cur_v_tile[:, nl.ds(v_i * B_D_SIZE, B_D_SIZE)] = loaded
-
-
-@nki.jit
-def flash_paged_attention(
-    query,
-    key,
-    value,
-    kv_cache,
-    block_tables,
-    mask,
-    softmax_scale=None,
-    mixed_precision=True,
-    LARGE_TILE_SZ=2048,
-    return_debug_tensors=False,
-):
-    """
-    Flash PagedAttention Forward Kernel.
-
-    IO tensor layouts:
-      - query: shape   (1, n_heads, d, seq_q)
-      - key:   shape   (1, n_kv_heads, d, seq_k)
-      - value: shape   (1, n_kv_heads, seq_v, d)
-      - kv_cache: (2, num_blocks, n_kv_heads, block_size, d)
-      - block_tables: (num_active_blocks, )
-      - mask: (seq_q, num_active_blocks * block_size + seq_q)
-      - o: shape (1, n_heads, seq_q, d)
-
-      - This kernel requires seq_k == seq_v
-      - We use continuous batching by default, so the batch dimension is
-        always 1, and different requests are concatenated along sequence
-        dimension.
-      - We use paged cache blocks (kv_cache) to store KV cache.
-
-    IO tensor dtypes:
-      - This kernel assumes all IO tensors have the same dtype except for
-        block_tables (int32) and mask (int32)
-      - If mixed_precision is True, then all Tensor Engine operation will be
-        performed in bfloat16 and accumulation will be performed in float32.
-        Otherwise the intermediates will be in the same type as the inputs.
-
-    Compile-time Constants:
-      - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)`
-      - mixed_precision: flag to set non-matmul ops in fp32 precision, default
-        is set to `true`, if false, we use same precision as input types
-      - LARGE_TILE_SZ: `default=2048`, size of the kv tile size for attention
-        computation reduction
-
-    GQA support Notes:
-      the spmd kernel for launching kernel should be on kv_heads instead of
-      nheads
-
-    Example usage:
-      MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d]
-        usage: `flash_fwd[b, h](q, k, v, ...)`
-      GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d]
-        usage: `flash_fwd[b, kv_h](q, k, v, ...)`
-    """
-    B_F_SIZE = 512
-    B_P_SIZE = 128
-    b, h, d, seqlen_q = query.shape
-    B_D_SIZE = d
-    n_tile_q = seqlen_q // B_P_SIZE  # since q will be loaded on tensor engine
-    _, num_blocks, k_h, block_size, _ = kv_cache.shape
-    q_h_per_k_h = h // k_h
-    assert b == 1, f"invalid batch size {b=}"
-    assert d <= 128, f" we do not support head_dim > 128, got head dim {d=}"
-    cache_shape = (2, num_blocks, k_h, block_size, d)
-    assert (tuple(kv_cache.shape) == cache_shape
-            ), f"{kv_cache.shape=} mismatch, expect {cache_shape}"
-    assert key is None or tuple(key.shape) == (
-        1,
-        k_h,
-        d,
-        seqlen_q,
-    ), f"key shape {key.shape} mismatch!"
-    assert value is None or tuple(value.shape) == (
-        1,
-        k_h,
-        seqlen_q,
-        d,
-    ), f"value shape {value.shape} mismatch!"
-
-    assert (
-        nl.program_ndim() == 2
-    ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!"
-    batch_id = nl.program_id(axis=0)
-    head_id = nl.program_id(axis=1)
-
-    (num_active_blocks, ) = block_tables.shape
-    context_kv_len = num_active_blocks * block_size
-    assert (
-        LARGE_TILE_SZ % B_F_SIZE == 0
-    ), f"Need {LARGE_TILE_SZ=} to be divisible by {B_F_SIZE=} in transpose_p"
-    assert (context_kv_len % LARGE_TILE_SZ == 0
-            ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}"
-
-    num_blocks_per_large_tile = LARGE_TILE_SZ // block_size
-    assert is_power_of_2(
-        num_blocks_per_large_tile
-    ), f"{num_blocks_per_large_tile=} is expected of be power of 2"
-    if seqlen_q > B_F_SIZE:
-        MAX_REDUCTION_TILE = 2048
-        if seqlen_q // 2 > MAX_REDUCTION_TILE:
-            assert (
-                seqlen_q % MAX_REDUCTION_TILE == 0
-            ), f"{seqlen_q=} should be divisible by {MAX_REDUCTION_TILE=}"
-        else:
-            assert (seqlen_q % B_F_SIZE == 0
-                    ), f"{seqlen_q=} should be divisible by {B_F_SIZE=})"
-
-    kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype
-    acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype
-    softmax_scale = softmax_scale or (1.0 / (d**0.5))
-    num_large_k_tile = context_kv_len // LARGE_TILE_SZ
-
-    o = nl.ndarray((b, h, seqlen_q, d),
-                   dtype=query.dtype,
-                   buffer=nl.shared_hbm)
-    hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = (
-        None,
-        None,
-        None,
-        None,
-    )
-    if return_debug_tensors:
-        hbm_l_buffer = nl.ndarray((b, h, seqlen_q),
-                                  dtype=acc_type,
-                                  buffer=nl.shared_hbm)
-        hbm_m_buffer = nl.ndarray((b, h, seqlen_q),
-                                  dtype=acc_type,
-                                  buffer=nl.shared_hbm)
-        hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q),
-                                dtype=acc_type,
-                                buffer=nl.shared_hbm)
-        qk_res_buffer = nl.zeros(
-            (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q),
-            dtype=acc_type,
-            buffer=nl.sbuf,
-            lazy_initialization=True,
-        )
-    block_tables_sbuf = load_block_tables(
-        block_tables_hbm=block_tables,
-        num_tiles=num_large_k_tile,
-        num_blocks_per_tile=num_blocks_per_large_tile,
-    )
-
-    # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
-    if num_blocks_per_large_tile < B_P_SIZE:
-        # we checked num_blocks_per_tile is a power of 2
-        assert B_P_SIZE % num_blocks_per_large_tile == 0
-        block_size_tiling_factor = B_P_SIZE // num_blocks_per_large_tile
-        # We assume block_size >= block_size_tiling_factor
-        assert block_size % block_size_tiling_factor == 0
-    else:
-        block_size_tiling_factor = 1
-    tiled_block_size = block_size // block_size_tiling_factor
-
-    # Indirect DMA load must be placed along Partition Dimension
-    block_tables_sbuf = transform_block_tables_for_indirect_load(
-        block_tables_sbuf,
-        block_size_tiling_factor=block_size_tiling_factor,
-        num_head=k_h,
-        head_id=head_id,
-    )
-
-    # Flatten KV cache to be 3D for loading into SBUF
-    new_cache_shape = (
-        2,
-        num_blocks * k_h * block_size_tiling_factor,
-        tiled_block_size * d,
-    )
-    kv_cache = kv_cache.reshape(new_cache_shape)
-
-    # Global Flash Attention accumulators
-    o_buffer = nl.zeros(
-        (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d),
-        dtype=acc_type,
-        buffer=nl.sbuf,
-        lazy_initialization=True,
-    )
-    l_buffer = nl.zeros(
-        (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1),
-        dtype=acc_type,
-        buffer=nl.sbuf,
-        lazy_initialization=True,
-    )
-    m_buffer = nl.zeros(
-        (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1),
-        dtype=acc_type,
-        buffer=nl.sbuf,
-        lazy_initialization=True,
-    )
-
-    for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile):
-        num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE)
-        cur_k_tile = nl.ndarray(
-            (par_dim(B_D_SIZE), LARGE_TILE_SZ),
-            dtype=kernel_dtype,
-        )
-        cur_v_tile = nl.ndarray(
-            (par_dim(B_P_SIZE), num_loads * tiled_block_size * B_D_SIZE),
-            dtype=kernel_dtype,
-        )
-        load_kv_tile_from_cache(
-            cur_k_tile=cur_k_tile,
-            cur_v_tile=cur_v_tile,
-            kv_cache=kv_cache,
-            block_tables=block_tables_sbuf,
-            large_k_tile_idx=large_k_tile_idx,
-            num_blocks_per_large_tile=num_blocks_per_large_tile,
-            tiled_block_size=tiled_block_size,
-            B_P_SIZE=B_P_SIZE,
-            B_D_SIZE=B_D_SIZE,
-        )
-
-        for i in nl.affine_range(n_tile_q):
-            cur_mask = nl.load(mask[
-                nl.ds(i * B_P_SIZE, B_P_SIZE),
-                nl.ds(large_k_tile_idx * LARGE_TILE_SZ, LARGE_TILE_SZ),
-            ])
-            for i_q_h in nl.affine_range(q_h_per_k_h):
-                q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
-                q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
-                q_sbuf_tile = nl.load(q_hbm_tile[:,
-                                                 nl.ds(i *
-                                                       B_P_SIZE, B_P_SIZE)])
-                if q_sbuf_tile.dtype != kernel_dtype:
-                    q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype)
-                q_tile[:, :] = q_sbuf_tile * softmax_scale
-
-                _flash_attention_core(
-                    q_local_tile=q_tile,
-                    k=cur_k_tile,
-                    v=cur_v_tile,
-                    o_buffer=o_buffer[i, i_q_h],
-                    l_buffer=l_buffer[i, i_q_h],
-                    m_buffer=m_buffer[i, i_q_h],
-                    kernel_dtype=kernel_dtype,
-                    acc_type=acc_type,
-                    tile_mask=cur_mask,
-                    use_causal_mask=False,
-                    q_tile_idx=i,
-                    initialize=large_k_tile_idx == 0,
-                    LARGE_TILE_SZ=LARGE_TILE_SZ,
-                    B_P_SIZE=B_P_SIZE,
-                    B_F_SIZE=B_F_SIZE,
-                    B_D_SIZE=B_D_SIZE,
-                )
-
-    # compute attention between input query, key and value
-    if key is not None and value is not None:
-        B_F_SIZE = min(seqlen_q, B_F_SIZE)
-        LARGE_TILE_SZ = seqlen_q
-
-        cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ),
-                                dtype=kernel_dtype)
-        cur_v_tile = nl.ndarray(
-            (par_dim(B_P_SIZE), LARGE_TILE_SZ // B_P_SIZE * B_D_SIZE),
-            dtype=kernel_dtype,
-        )
-
-        loaded = nl.load(key[batch_id, head_id, :, :])
-        if loaded.dtype != kernel_dtype:
-            loaded = nl.copy(loaded, dtype=kernel_dtype)
-        cur_k_tile[:, :] = loaded
-
-        v_hbm_tile = value[batch_id, head_id]
-        for v_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE):
-            load_v_tile(
-                v_hbm_tile=v_hbm_tile,
-                cur_v_tile=cur_v_tile,
-                large_tile_idx=0,
-                v_i=v_i,
-                LARGE_TILE_SZ=LARGE_TILE_SZ,
-            )
-
-        for i in nl.affine_range(n_tile_q):
-            cur_mask = nl.load(mask[
-                nl.ds(i * B_P_SIZE, B_P_SIZE),
-                nl.ds(context_kv_len, LARGE_TILE_SZ),
-            ])
-            for i_q_h in nl.affine_range(q_h_per_k_h):
-
-                q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
-                q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
-                q_sbuf_tile = nl.load(q_hbm_tile[:,
-                                                 nl.ds(i *
-                                                       B_P_SIZE, B_P_SIZE)])
-                if q_sbuf_tile.dtype != kernel_dtype:
-                    q_sbuf_tile = nl.copy(q_sbuf_tile, dtype=kernel_dtype)
-                q_tile[:, :] = q_sbuf_tile * softmax_scale
-                _flash_attention_core(
-                    q_local_tile=q_tile,
-                    k=cur_k_tile,
-                    v=cur_v_tile,
-                    o_buffer=o_buffer[i, i_q_h],
-                    l_buffer=l_buffer[i, i_q_h],
-                    m_buffer=m_buffer[i, i_q_h],
-                    kernel_dtype=kernel_dtype,
-                    acc_type=acc_type,
-                    tile_mask=cur_mask,
-                    use_causal_mask=True,
-                    q_tile_idx=i,
-                    initialize=False,
-                    LARGE_TILE_SZ=LARGE_TILE_SZ,
-                    B_P_SIZE=B_P_SIZE,
-                    B_F_SIZE=B_F_SIZE,
-                    B_D_SIZE=B_D_SIZE,
-                    qk_res_buffer=(qk_res_buffer[i, i_q_h]
-                                   if qk_res_buffer is not None else None),
-                )
-
-    # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- #
-    for i_q_h in nl.affine_range(q_h_per_k_h):
-        for i in nl.affine_range(n_tile_q):
-            out = nl.multiply(
-                o_buffer[i, i_q_h],
-                nl.exp(m_buffer[i, i_q_h] - l_buffer[i, i_q_h]),
-                dtype=kernel_dtype,
-            )
-
-            nl.store(
-                o[
-                    batch_id,
-                    head_id * q_h_per_k_h + i_q_h,
-                    nl.ds(i * B_P_SIZE, B_P_SIZE),
-                    :,
-                ],
-                out,
-            )
-            # maximum and summation statistics
-            if return_debug_tensors:
-                nl.store(
-                    hbm_m_buffer[
-                        batch_id,
-                        head_id * q_h_per_k_h + i_q_h,
-                        nl.ds(i * B_P_SIZE, B_P_SIZE),
-                    ],
-                    m_buffer[i, i_q_h, :, :],
-                )
-                nl.store(
-                    hbm_l_buffer[
-                        batch_id,
-                        head_id * q_h_per_k_h + i_q_h,
-                        nl.ds(i * B_P_SIZE, B_P_SIZE),
-                    ],
-                    l_buffer[i, i_q_h],
-                )
-                nl.store(
-                    hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :],
-                    qk_res_buffer[batch_id, i_q_h, :, :],
-                )
-
-    if return_debug_tensors:
-        return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res
-    return o
-
-
-def reorder_context_mask(mask, LARGE_TILE_SZ, block_size):
-    """
-    Reorder the mask to make it compatible with the flash attention kernel.
-
-    We vectorize KV cache read to improve DMA utilization. However, the layout
-    that maximizes DMA bandwidth changes the order tokens are consumed.
-    
-    The token layout (inner 2 dimensions) after vectorized load is (B_P_SIZE,
-    tiled_block_size) in a tile of `B_P_SIZE * tiled_block_size` tokens. And
-    each step the engine consumes a column (rather than a row) of B_P_SIZE
-    tokens. Therefore, the tokens are visited in a strided way.
-
-    To make sure mask matches the order tokens are consumed, we need to properly
-    transpose mask.
-    """
-    total_query_len, total_seq_len = mask.shape
-    context_kv_len = total_seq_len - total_query_len
-
-    B_P_SIZE = 128
-    assert (LARGE_TILE_SZ
-            >= B_P_SIZE), f"{LARGE_TILE_SZ=} must be larger than {B_P_SIZE=}"
-    num_tiled_blocks = max(B_P_SIZE, LARGE_TILE_SZ // block_size)
-    tiled_block_size = LARGE_TILE_SZ // num_tiled_blocks
-    if tiled_block_size > 1:
-        # Mask reordering is needed when tiled_block_size > 1
-        device = mask.device
-        mask = mask.cpu()
-        context_mask = mask[:, :context_kv_len]
-        context_mask = context_mask.view(
-            total_query_len,
-            context_kv_len // LARGE_TILE_SZ,
-            num_tiled_blocks // B_P_SIZE,
-            B_P_SIZE,
-            tiled_block_size,
-        )
-        context_mask = context_mask.transpose(3, 4).reshape(
-            total_query_len, context_kv_len)
-        new_mask = mask[:, context_kv_len:]
-        return torch.concat([context_mask, new_mask], dim=1).to(device)
-    else:
-        return mask
-
-
-def flash_attn_varlen_nkifunc(
-    query,
-    key,
-    value,
-    kv_cache,
-    block_table,
-    attn_mask,
-    n_kv_head=None,
-    head_size=None,
-    LARGE_TILE_SZ=2048,
-    mixed_precision=True,
-):
-    """
-    Compute flash paged attention for variable length sequences.
-
-    This function is a wrapper around the flash attention NKI kernel. It takes
-    in the following arguments:
-      - query: (1, n_heads, d, seq_q)
-      - key:   (1, n_kv_heads, d, seq_k)
-      - value: (1, n_kv_heads, seq_v, d)
-      - kv_cache:   (2, n_blocks, n_kv_heads, block_size, d)
-      - block_tables: (n_active_blocks, )
-      - attn_mask: (seq_q, n_active_blocks * block_size + seq_q)
-
-    Notes:
-      - attn_mask must be reordered outside using `reorder_context_mask`
-      - Key/value cache layout must be (n_blocks, n_kv_heads, block_size, d) 
-        for better DMA throughput
-    """
-    if n_kv_head is None:
-        n_kv_head = kv_cache.shape[2]
-    assert kv_cache.shape[0] == 2
-    assert kv_cache.shape[2] == n_kv_head
-    if head_size is None:
-        head_size = kv_cache.shape[-1]
-
-    kwargs = dict(
-        query=query,
-        key=key,
-        value=value,
-        kv_cache=kv_cache,
-        block_tables=block_table,
-        mask=attn_mask,
-        softmax_scale=1.0 / (head_size**0.5),
-        mixed_precision=mixed_precision,
-        LARGE_TILE_SZ=LARGE_TILE_SZ,
-    )
-
-    o = flash_paged_attention[1, n_kv_head](**kwargs)
-    return o
-
-
-def reshape_and_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    kv_cache: torch.Tensor,
-    slot_mapping: torch.Tensor,
-) -> None:
-    """
-    Writes key-value pairs to the KV cache at specified positions.
-
-    Args:
-        key (torch.Tensor): Key tensor with shape
-            (num_tokens, n_kv_head, d_head)
-        value (torch.Tensor): Value tensor with shape 
-            (num_tokens, n_kv_head, d_head)
-        kv_cache (torch.Tensor): Key/value cache tensor with shape 
-            (2, num_blocks, n_kv_head, block_size, d_head)
-        slot_mapping (torch.Tensor): Mapping tensor indicating cache positions
-            with shape (num_tokens)
-
-    Returns:
-        None: Updates the kv_cache tensor in-place
-    """
-    block_size = kv_cache.size(3)
-    n_kv_head = key.size(1)
-
-    # Calculate indices with explicit floor division
-    block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    block_offsets = slot_mapping % block_size
-
-    # Create the head indices tensor
-    head_indices = torch.arange(n_kv_head, device=key.device)
-
-    # Update caches using index_put_
-    kv_cache.index_put_(
-        (torch.tensor([0], device=key.device), block_indices[:, None],
-         head_indices[None, :], block_offsets[:, None]), key)
-
-    kv_cache.index_put_(
-        (torch.tensor([1], device=key.device), block_indices[:, None],
-         head_indices[None, :], block_offsets[:, None]), value)
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index c6d1501e2757844bb70754213ed9df2f5d3df063..4d870a45e5800fd73dddc8279e88e21261cfacd8 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -6,9 +6,14 @@ from typing import List, Optional, Tuple
 
 import torch
 
-from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
 from vllm.triton_utils import HAS_TRITON
 
+if current_platform.is_cuda_alike():
+    from vllm import _custom_ops as ops
+elif current_platform.is_xpu():
+    from vllm._ipex_ops import ipex_ops as ops
+
 if HAS_TRITON:
     from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index e1d41930f6231a51c6bb29d5f308bdf3f6e33bf0..7e5c2b6c62e9be8d0b70cf70f316dc46b4644521 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -15,6 +15,7 @@ NUM_WARPS = 4 if current_platform.is_rocm() else 8
 
 # To check compatibility
 IS_TURING = current_platform.get_device_capability() == (7, 5)
+float8_info = torch.finfo(current_platform.fp8_dtype())
 
 
 # Here's an example autotuner config for this kernel. This config does provide
@@ -43,6 +44,7 @@ def _fwd_kernel(Q,
                 sm_scale,
                 k_scale,
                 v_scale,
+                out_scale_inv,
                 B_Start_Loc,
                 B_Seqlen,
                 x: tl.constexpr,
@@ -82,8 +84,11 @@ def _fwd_kernel(Q,
                 num_unroll_request: tl.constexpr,
                 SKIP_DECODE: tl.constexpr,
                 USE_SINKS: tl.constexpr,
+                USE_FP8: tl.constexpr,
                 MAX_Q_LEN: tl.constexpr = 0,
-                MAX_CTX_LEN: tl.constexpr = 0):
+                MAX_CTX_LEN: tl.constexpr = 0,
+                FP8_MIN: tl.constexpr = float8_info.min,
+                FP8_MAX: tl.constexpr = float8_info.max):
 
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -146,7 +151,7 @@ def _fwd_kernel(Q,
         start_n = tl.multiple_of(start_n, BLOCK_SIZE)
         # -- compute qk ----
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
-                     (start_n // BLOCK_SIZE) * stride_b_loc_s)
+                     (start_n // BLOCK_SIZE) * stride_b_loc_s).to(tl.int64)
         # [D,BLOCK_SIZE]
         off_k = (
             bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
@@ -284,6 +289,9 @@ def _fwd_kernel(Q,
     off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs +
              cur_head * stride_oh + offs_d[None, :] * stride_od)
     out_ptrs = Out + off_o
+    if USE_FP8:
+        acc = acc * tl.load(out_scale_inv)
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
     tl.store(out_ptrs,
              acc,
              mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len))
@@ -367,7 +375,7 @@ def _fwd_kernel_flash_attn_v2(
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                      ((start_n + offs_n) // block_size) * stride_b_loc_s,
                      mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0)
+                     other=0).to(tl.int64)
         off_k = (
             bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
             (offs_d[:, None] // x) * stride_k_cache_d +
@@ -575,7 +583,7 @@ def _fwd_kernel_alibi(
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                      ((start_n + offs_n) // block_size) * stride_b_loc_s,
                      mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0)
+                     other=0).to(tl.int64)
         off_k = (
             bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
             (offs_d[:, None] // x) * stride_k_cache_d +
@@ -743,6 +751,7 @@ def context_attention_fwd(q,
                           sliding_window=None,
                           sm_scale=None,
                           skip_decode=False,
+                          fp8_out_scale=None,
                           sinks=None):
 
     q_dtype_is_f32 = q.dtype is torch.float32
@@ -793,6 +802,7 @@ def context_attention_fwd(q,
 
     if alibi_slopes is not None:
         assert sinks is None, "Sinks arg is not supported with alibi"
+        assert fp8_out_scale is None, "FP8 output not supported with alibi"
         # need to reduce num. blocks when using fp32
         # due to increased use of GPU shared memory
         # if q.dtype is torch.float32:
@@ -870,6 +880,7 @@ def context_attention_fwd(q,
         sm_scale,
         k_scale,
         v_scale,
+        1.0 / fp8_out_scale if fp8_out_scale is not None else 1.0,
         b_start_loc,
         b_seq_len,
         k_cache.shape[4],
@@ -905,6 +916,7 @@ def context_attention_fwd(q,
         BLOCK_DMODEL_PADDED=Lk_padded,
         SLIDING_WINDOW=sliding_window,
         SKIP_DECODE=skip_decode,
+        USE_FP8=fp8_out_scale is not None,
         BLOCK_M=128,
         BLOCK_N=64,
         num_unroll_cache=4,
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 56ebed0f524487647340d52b37a45a20228d95df..d2ad2f7e8d2aafaecb3e237f1faae77c14512b83 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -10,9 +10,11 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 
 logger = init_logger(__name__)
+float8_info = torch.finfo(current_platform.fp8_dtype())
 
 
 @triton.jit
@@ -48,47 +50,51 @@ def find_seq_idx(query_start_len_ptr, target_idx, num_seqs,
 
 @triton.jit
 def kernel_unified_attention_2d(
-        output_ptr,  # [num_tokens, num_query_heads, head_size]
-        query_ptr,  # [num_tokens, num_query_heads, head_size]
-        key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-        value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
-        sink_ptr,  # [num_query_heads]
-        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-        seq_lens_ptr,  # [num_seqs]
-        alibi_slopes_ptr,  # [num_query_heads]
-        qq_bias_ptr,  # [num_query_tokens, num_query_tokens]
-        scale,  # float32
-        k_scale,  # float32
-        v_scale,  # float32
-        softcap,  # float32
-        num_query_heads: tl.constexpr,  # int
-        num_queries_per_kv: tl.constexpr,  # int
-        block_table_stride: tl.int64,  # int
-        query_stride_0: tl.int64,  # int
-        query_stride_1: tl.int64,  # int, should be equal to head_size
-        output_stride_0: tl.int64,  # int
-        output_stride_1: tl.int64,  # int, should be equal to head_size
-        qq_bias_stride_0: tl.int64,  # int
-        BLOCK_SIZE: tl.constexpr,  # int
-        HEAD_SIZE: tl.constexpr,  # int
-        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-        USE_ALIBI_SLOPES: tl.constexpr,  # bool
-        USE_QQ_BIAS: tl.constexpr,  # bool
-        USE_SOFTCAP: tl.constexpr,  # bool
-        USE_SINKS: tl.constexpr,  # bool
-        SLIDING_WINDOW: tl.constexpr,  # int
-        stride_k_cache_0: tl.int64,  # int
-        stride_k_cache_1: tl.int64,  # int
-        stride_k_cache_2: tl.int64,  # int
-        stride_k_cache_3: tl.constexpr,  # int
-        stride_v_cache_0: tl.int64,  # int
-        stride_v_cache_1: tl.int64,  # int
-        stride_v_cache_2: tl.int64,  # int
-        stride_v_cache_3: tl.constexpr,  # int
-        query_start_len_ptr,  # [num_seqs+1]
-        BLOCK_Q: tl.constexpr,  # int
-        num_seqs: tl.int32,
-        BLOCK_M: tl.constexpr,  # int
+    output_ptr,  # [num_tokens, num_query_heads, head_size]
+    query_ptr,  # [num_tokens, num_query_heads, head_size]
+    key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+    value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+    sink_ptr,  # [num_query_heads]
+    block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+    seq_lens_ptr,  # [num_seqs]
+    alibi_slopes_ptr,  # [num_query_heads]
+    qq_bias_ptr,  # [num_query_tokens, num_query_tokens]
+    scale,  # float32
+    k_scale,  # float32
+    v_scale,  # float32
+    out_scale,  # float32
+    softcap,  # float32
+    num_query_heads: tl.constexpr,  # int
+    num_queries_per_kv: tl.constexpr,  # int
+    block_table_stride: tl.int64,  # int
+    query_stride_0: tl.int64,  # int
+    query_stride_1: tl.int64,  # int, should be equal to head_size
+    output_stride_0: tl.int64,  # int
+    output_stride_1: tl.int64,  # int, should be equal to head_size
+    qq_bias_stride_0: tl.int64,  # int
+    BLOCK_SIZE: tl.constexpr,  # int
+    HEAD_SIZE: tl.constexpr,  # int
+    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+    USE_ALIBI_SLOPES: tl.constexpr,  # bool
+    USE_QQ_BIAS: tl.constexpr,  # bool
+    USE_SOFTCAP: tl.constexpr,  # bool
+    USE_SINKS: tl.constexpr,  # bool
+    SLIDING_WINDOW: tl.constexpr,  # int
+    stride_k_cache_0: tl.int64,  # int
+    stride_k_cache_1: tl.int64,  # int
+    stride_k_cache_2: tl.int64,  # int
+    stride_k_cache_3: tl.constexpr,  # int
+    stride_v_cache_0: tl.int64,  # int
+    stride_v_cache_1: tl.int64,  # int
+    stride_v_cache_2: tl.int64,  # int
+    stride_v_cache_3: tl.constexpr,  # int
+    query_start_len_ptr,  # [num_seqs+1]
+    BLOCK_Q: tl.constexpr,  # int
+    num_seqs: tl.int32,
+    BLOCK_M: tl.constexpr,  # int
+    USE_FP8: tl.constexpr,  # bool
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
 ):
     q_block_global_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
@@ -281,6 +287,9 @@ def kernel_unified_attention_2d(
 
     # epilogue
     acc = acc / L[:, None]
+    if USE_FP8:
+        acc = acc * tl.load(out_scale)
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
 
     output_offset = (query_offset_0[:, None] * output_stride_0 +
                      query_offset_1[:, None] * output_stride_1 +
@@ -552,23 +561,27 @@ def kernel_unified_attention_3d(
 
 @triton.jit
 def reduce_segments(
-        output_ptr,  # [num_tokens, num_query_heads, head_size]
-        segm_output_ptr,
-        #[num_tokens, num_query_heads, max_num_segments, head_size]
-        segm_max_ptr,  # [num_tokens, num_query_heads, max_num_segments]
-        segm_expsum_ptr,  # [num_tokens, num_query_heads, max_num_segments]
-        seq_lens_ptr,  # [num_seqs]
-        num_seqs,  # int
-        num_query_heads: tl.constexpr,  # int
-        output_stride_0: tl.int64,  # int
-        output_stride_1: tl.int64,  # int, should be equal to head_size
-        block_table_stride: tl.int64,  # int
-        BLOCK_SIZE: tl.constexpr,  # int
-        HEAD_SIZE: tl.constexpr,  # int, must be power of 2
-        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-        query_start_len_ptr,  # [num_seqs+1]
-        BLOCK_Q: tl.constexpr,  # int
-        NUM_SEGMENTS_PER_SEQ: tl.constexpr,  # int
+    output_ptr,  # [num_tokens, num_query_heads, head_size]
+    segm_output_ptr,
+    #[num_tokens, num_query_heads, max_num_segments, head_size]
+    segm_max_ptr,  # [num_tokens, num_query_heads, max_num_segments]
+    segm_expsum_ptr,  # [num_tokens, num_query_heads, max_num_segments]
+    seq_lens_ptr,  # [num_seqs]
+    num_seqs,  # int
+    num_query_heads: tl.constexpr,  # int
+    out_scale_inv,  # float32
+    output_stride_0: tl.int64,  # int
+    output_stride_1: tl.int64,  # int, should be equal to head_size
+    block_table_stride: tl.int64,  # int
+    BLOCK_SIZE: tl.constexpr,  # int
+    HEAD_SIZE: tl.constexpr,  # int, must be power of 2
+    HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+    query_start_len_ptr,  # [num_seqs+1]
+    BLOCK_Q: tl.constexpr,  # int
+    NUM_SEGMENTS_PER_SEQ: tl.constexpr,  # int
+    USE_FP8: tl.constexpr,  # bool
+    FP8_MIN: tl.constexpr = float8_info.min,
+    FP8_MAX: tl.constexpr = float8_info.max,
 ):
     query_token_idx = tl.program_id(0)
     query_head_idx = tl.program_id(1)
@@ -624,6 +637,10 @@ def reduce_segments(
     # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0
     acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum)
 
+    if USE_FP8:
+        acc = acc * tl.load(out_scale_inv)
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
+
     # write result
     output_offset = (query_token_idx * output_stride_0 +
                      query_head_idx * output_stride_1 +
@@ -649,6 +666,7 @@ def unified_attention(
     k_descale,
     v_descale,
     alibi_slopes=None,
+    output_scale=None,
     qq_bias=None,
     # Optional tensor for sinks
     sinks=None,
@@ -674,7 +692,8 @@ def unified_attention(
     num_queries_per_kv = num_query_heads // num_kv_heads
     head_size = q.shape[2]
 
-    BLOCK_M = 16
+    BLOCK_M = 16 if num_queries_per_kv <= 16 else triton.next_power_of_2(
+        num_queries_per_kv)
     BLOCK_Q = BLOCK_M // num_queries_per_kv
 
     # Ideally we would launch with kernel with:
@@ -706,6 +725,7 @@ def unified_attention(
             scale=softmax_scale,
             k_scale=k_descale,
             v_scale=v_descale,
+            out_scale=1 / output_scale if output_scale is not None else 1.0,
             softcap=softcap,
             num_query_heads=num_query_heads,
             num_queries_per_kv=num_queries_per_kv,
@@ -735,6 +755,7 @@ def unified_attention(
             BLOCK_Q=BLOCK_Q,
             num_seqs=num_seqs,
             BLOCK_M=BLOCK_M,
+            USE_FP8=output_scale is not None,
         )
     else:
         # for initial version, NUM_SEGMENTS = 16 is chosen as a default
@@ -818,6 +839,8 @@ def unified_attention(
             seq_lens_ptr=seqused_k,
             num_seqs=num_seqs,
             num_query_heads=num_query_heads,
+            out_scale_inv=1 /
+            output_scale if output_scale is not None else 1.0,
             output_stride_0=out.stride(0),
             output_stride_1=out.stride(1),
             block_table_stride=block_table.stride(0),
@@ -827,4 +850,5 @@ def unified_attention(
             query_start_len_ptr=cu_seqlens_q,
             BLOCK_Q=BLOCK_Q,
             NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
+            USE_FP8=output_scale is not None,
         )
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index f8b00565f0517b1e186133283b328423cefa5e5d..dc0af7e28e3e25b5b9948b2c5eeb7b9a3698acd1 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -68,5 +68,18 @@ def flash_attn_supports_fp8() -> bool:
         current_platform.get_device_capability().major == 9
 
 
+def flash_attn_supports_mla():
+    from vllm.platforms import current_platform
+    if current_platform.is_cuda():
+        try:
+            from vllm.vllm_flash_attn.flash_attn_interface import (
+                is_fa_version_supported)
+            return is_fa_version_supported(3) \
+                and current_platform.get_device_capability()[0] == 9
+        except (ImportError, AssertionError):
+            pass
+    return False
+
+
 def is_flash_attn_varlen_func_available() -> bool:
     return current_platform.is_cuda() or current_platform.is_xpu()
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index 5a2e79e1b5c74d627c8fac09ee490d36ad14bdd2..01124872e98c06e501659e8a44fbeabb30d83d6d 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -4,8 +4,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
 
+from vllm.logprobs import Logprob
 from vllm.lora.request import LoRARequest
-from vllm.sequence import Logprob
 
 if TYPE_CHECKING:
     from vllm.multimodal import MultiModalDataDict
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 93519b5ba1523bf9e9a4b6f88ab5b85cdd6a2909..32820b026b6f6809b0cb5b7d02c6f009766fe7bf 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -198,8 +198,9 @@ class BenchmarkDataset(ABC):
 
     @abstractmethod
     def sample(self, tokenizer: PreTrainedTokenizerBase,
-               num_requests: int, 
-               request_id_prefix: str = "") -> list[SampleRequest]:
+               num_requests: int,
+               request_id_prefix: str = "",
+               no_oversample: bool = False) -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
 
@@ -224,6 +225,7 @@ class BenchmarkDataset(ABC):
         requests: list[SampleRequest],
         num_requests: int,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
     ) -> None:
         """
         Oversamples the list of requests if its size is less than the desired
@@ -236,6 +238,11 @@ class BenchmarkDataset(ABC):
             request_id_prefix (str) The prefix of the request ids.
 
         """
+        if no_oversample:
+            logger.info("Skipping oversampling. " \
+            "Total samples: %d.", len(requests))
+            return
+
         if len(requests) < num_requests:
             random.seed(self.random_seed)
             additional = deepcopy(
@@ -405,6 +412,7 @@ class RandomDataset(BenchmarkDataset):
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         prefix_len: int = DEFAULT_PREFIX_LEN,
         range_ratio: float = DEFAULT_RANGE_RATIO,
         input_len: int = DEFAULT_INPUT_LEN,
@@ -832,6 +840,7 @@ class RandomMultiModalDataset(RandomDataset):
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
         range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
         input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
@@ -959,6 +968,7 @@ class ShareGPTDataset(BenchmarkDataset):
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
         samples: list = []
@@ -1002,7 +1012,10 @@ class ShareGPTDataset(BenchmarkDataset):
                     request_id=request_id_prefix + str(ind),
                 ))
             ind += 1
-        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
+        self.maybe_oversample_requests(samples, 
+                                       num_requests, 
+                                       request_id_prefix, 
+                                       no_oversample)
         return samples
 
 
@@ -1020,7 +1033,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         default="random",
         choices=[
             "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf", 
-            "custom", "prefix_repetition"
+            "custom", "prefix_repetition", "spec_bench"
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -1036,6 +1049,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         help="Path to the sharegpt/sonnet dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
+    parser.add_argument(
+        "--no-oversample",
+        action="store_true",
+        help="Do not oversample if the dataset has " \
+        "fewer samples than num-prompts.",
+    )
 
     # group for dataset specific arguments
     custom_group = parser.add_argument_group("custom dataset options")
@@ -1053,6 +1072,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "Skip applying chat template to prompt, used only for custom dataset.",
     )
 
+    spec_bench_group = parser.add_argument_group("spec bench dataset options")
+    spec_bench_group.add_argument(
+        "--spec-bench-output-len",
+        type=int,
+        default=256,
+        help=
+        "Num of output tokens per request, used only for spec bench dataset.",
+    )
+    spec_bench_group.add_argument(
+        "--spec-bench-category",
+        type=str,
+        default=None,
+        help=
+        "Category for spec bench dataset. If None, use all categories.",
+    )
+
     sonnet_group = parser.add_argument_group("sonnet dataset options")
     sonnet_group.add_argument(
         "--sonnet-input-len",
@@ -1085,6 +1120,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         "from the ShareGPT dataset.",
     )
 
+    blazedit_group = parser.add_argument_group("blazedit dataset options")
+    blazedit_group.add_argument(
+        "--blazedit-min-distance",
+        type=float,
+        default=0.0,
+        help=
+        "Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+    blazedit_group.add_argument(
+        "--blazedit-max-distance",
+        type=float,
+        default=1.0,
+        help=
+        "Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+
     random_group = parser.add_argument_group("random dataset options")
     random_group.add_argument(
         "--random-input-len",
@@ -1227,6 +1278,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
                           type=str,
                           default=None,
                           help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-name",
+        type=str,
+        default=None,
+        help=(
+            "Name of the dataset on HuggingFace "
+            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
+            "Specify this if your dataset-path is a local path."
+        ),
+    )
     hf_group.add_argument(
         "--hf-output-len",
         type=int,
@@ -1268,6 +1329,10 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
 
 
 def get_samples(args, tokenizer) -> list[SampleRequest]:
+
+    if not hasattr(args, "request_id_prefix"):
+        args.request_id_prefix = ""
+
     if args.dataset_name == "custom":
         dataset = CustomDataset(dataset_path=args.dataset_path)
         input_requests = dataset.sample(
@@ -1276,6 +1341,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             output_len=args.custom_output_len,
             skip_chat_template=args.custom_skip_chat_template,
             request_id_prefix=args.request_id_prefix,
+            no_oversample=args.no_oversample,
         )
 
     elif args.dataset_name == "sonnet":
@@ -1290,6 +1356,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 tokenizer=tokenizer,
                 return_prompt_formatted=False,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             )
         else:
             assert tokenizer.chat_template or tokenizer.default_chat_template, (
@@ -1302,33 +1369,67 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 tokenizer=tokenizer,
                 return_prompt_formatted=True,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             )
 
     elif args.dataset_name == "hf":
         # all following datasets are implemented from the
         # HuggingFaceDataset base class
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+        hf_kwargs = {}
+        if (
+            args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = VisionArenaDataset
             args.hf_split = "train"
             args.hf_subset = None
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = InstructCoderDataset
             args.hf_split = "train"
-        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = MTBenchDataset
             args.hf_split = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = ConversationDataset
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = AIMODataset
             args.hf_split = "train"
-        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
+        elif (
+            args.dataset_path
+            in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS  # noqa: E501
+            or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = NextEditPredictionDataset
             args.hf_split = "train"
-        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
+        elif (
+            args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = ASRDataset
             args.hf_split = "train"
-        elif args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS:
+        elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = BlazeditDataset
+            args.hf_split = "train"
+            hf_kwargs = {
+                "min_distance": args.blazedit_min_distance,
+                "max_distance": args.blazedit_max_distance,
+            }
+        elif (
+            args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
+            or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
+        ):
             dataset_class = MLPerfDataset
             args.hf_split = "train"
         else:
@@ -1358,16 +1459,28 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             dataset_split=args.hf_split,
             random_seed=args.seed,
             no_stream=args.no_stream,
+            hf_name=args.hf_name,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.hf_output_len,
             request_id_prefix=args.request_id_prefix,
+            no_oversample=args.no_oversample,
+            **hf_kwargs
         )
 
     else:
         # For datasets that follow a similar structure, use a mapping.
         dataset_mapping = {
+            "spec_bench":
+            lambda: SpecBench(dataset_path=args.dataset_path, 
+                              category=args.spec_bench_category).sample(
+                num_requests=args.num_prompts,
+                tokenizer=tokenizer,
+                output_len=args.spec_bench_output_len,
+                request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
+            ),
             "sharegpt": lambda: ShareGPTDataset(
                 random_seed=args.seed, dataset_path=args.dataset_path
             ).sample(
@@ -1375,6 +1488,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 num_requests=args.num_prompts,
                 output_len=args.sharegpt_output_len,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             ),
             "burstgpt": lambda: BurstGPTDataset(
                 random_seed=args.seed, dataset_path=args.dataset_path
@@ -1382,6 +1496,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             ),
             "random": lambda: RandomDataset(
                 random_seed=args.seed, dataset_path=args.dataset_path
@@ -1394,6 +1509,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 range_ratio=args.random_range_ratio,
                 request_id_prefix=args.request_id_prefix,
                 batchsize=args.random_batch_size,
+                no_oversample=args.no_oversample,
             ),
             "random-mm":
             lambda: RandomMultiModalDataset(
@@ -1410,6 +1526,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
                 bucket_config=args.random_mm_bucket_config,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             ),
             "prefix_repetition":
             lambda: PrefixRepetitionRandomDataset(
@@ -1422,6 +1539,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 num_prefixes=args.prefix_repetition_num_prefixes,
                 output_len=args.prefix_repetition_output_len,
                 request_id_prefix=args.request_id_prefix,
+                no_oversample=args.no_oversample,
             ),
         }
 
@@ -1503,8 +1621,17 @@ class CustomDataset(BenchmarkDataset):
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
+        # load all data if needed
+        self.num_available_samples = len(self.data)
+        if num_requests <= 0:
+            num_requests = self.num_available_samples
+            logger.info("num_requests is set to 0 or negative, "
+                        "so using all available samples: %d",
+                        num_requests)
+            
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
@@ -1531,11 +1658,57 @@ class CustomDataset(BenchmarkDataset):
                     request_id=request_id_prefix + str(i),
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
 
         return sampled_requests
 
 
+# -----------------------------------------------------------------------------
+# Spec Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class SpecBench(CustomDataset):
+    """
+    Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
+    Download the dataset using: 
+    wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+    """ # noqa: E501
+
+    def __init__(self, **kwargs) -> None:
+        self.category = kwargs.pop("category", None)
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        self.data = []
+
+        # Load the JSONL file
+        jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
+                                    lines=True)
+
+        # check if the JSONL file has a 'turns' column
+        if "turns" not in jsonl_data.columns:
+            raise ValueError("JSONL file must contain a 'turns' column.")
+
+        for _, row in jsonl_data.iterrows():
+            # sample only from a specific category if specified
+            if (not self.category) or (self.category == row['category']):
+                prompt = row["turns"][0]
+                self.data.append({"prompt": prompt})
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(self, **kwargs) -> list:
+        # leverage CustomDataset sample
+        kwargs["skip_chat_template"] = False
+        return super().sample(**kwargs)
+    
+    
 # -----------------------------------------------------------------------------
 # Sonnet Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -1576,6 +1749,7 @@ class SonnetDataset(BenchmarkDataset):
         output_len: int = DEFAULT_OUTPUT_LEN,
         return_prompt_formatted: bool = False,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
         # Calculate average token length for a poem line.
@@ -1671,6 +1845,7 @@ class BurstGPTDataset(BenchmarkDataset):
         max_loras: Optional[int] = None,
         lora_path: Optional[str] = None,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list[SampleRequest]:
         samples = []
@@ -1710,6 +1885,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         dataset_split: str,
         no_stream: bool = False,
         dataset_subset: Optional[str] = None,
+        hf_name: Optional[str] = None,
         **kwargs,
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)
@@ -1717,6 +1893,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
         self.load_stream = not no_stream
+        self.hf_name = hf_name or dataset_path
         self.load_data()
 
     def load_data(self) -> None:
@@ -1748,6 +1925,7 @@ class ConversationDataset(HuggingFaceDataset):
                output_len: Optional[int] = None,
                enable_multimodal_chat: bool = False,
                request_id_prefix: str = "",
+               no_oversample: bool = False,
                **kwargs) -> list:
         # Filter examples with at least 2 conversations
         filtered_data = self.data.filter(
@@ -1789,7 +1967,7 @@ class ConversationDataset(HuggingFaceDataset):
                 ))
             ind += 1
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -1819,6 +1997,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
         output_len = (output_len
@@ -1827,10 +2006,9 @@ class VisionArenaDataset(HuggingFaceDataset):
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
-            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
+            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
             if parser_fn is None:
-                raise ValueError(
-                    f"Unsupported dataset path: {self.dataset_path}")
+                raise ValueError(f"Unsupported dataset path: {self.hf_name}")
             prompt = parser_fn(item)
             mm_content = process_image(item["images"][0])
             prompt_len = len(tokenizer(prompt).input_ids)
@@ -1849,7 +2027,7 @@ class VisionArenaDataset(HuggingFaceDataset):
                     request_id=request_id_prefix + str(i),
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -1879,6 +2057,7 @@ class InstructCoderDataset(HuggingFaceDataset):
                output_len: Optional[int] = None,
                enable_multimodal_chat: bool = False,
                request_id_prefix: str = "",
+               no_oversample: bool = False,
                **kwargs) -> list:
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
@@ -1910,7 +2089,7 @@ class InstructCoderDataset(HuggingFaceDataset):
                     request_id=request_id_prefix + str(i),
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -1941,6 +2120,7 @@ class MTBenchDataset(HuggingFaceDataset):
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
         output_len = (output_len
@@ -1971,7 +2151,96 @@ class MTBenchDataset(HuggingFaceDataset):
                     request_id=request_id_prefix + str(i),
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
+        return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# Blazedit Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class BlazeditDataset(HuggingFaceDataset):
+    """
+    Blazedit Dataset.
+    https://github.com/ise-uiuc/blazedit
+
+    5k char version: vdaita/edit_5k_char
+    10k char version: vdaita/edit_10k_char
+    """  # noqa: E501
+
+    # 5k char version will have output as ~5k chars
+    # 10k char version will have output as ~10k chars
+    # Assuming 3 char per token, 10k chars will be 3333 tokens
+    # We set default to 4000 to be safe
+    DEFAULT_OUTPUT_LEN = 4000
+    SUPPORTED_DATASET_PATHS = {
+        "vdaita/edit_5k_char",
+        "vdaita/edit_10k_char",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        min_distance: float = 0.0,
+        max_distance: float = 1.0,
+        **kwargs,
+    ) -> list:
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        sampled_requests = []
+
+        for i, item in enumerate(self.data):
+            if len(sampled_requests) >= num_requests:
+                break
+            code = item["code"]
+            change_request = item["change_request"]
+            norm_distance = item["norm_distance"]
+
+            # compare the levenshtein distance normalized by code length
+            if norm_distance < min_distance or norm_distance > max_distance:
+                continue
+            
+            # template copied from 
+            # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
+            instruction = f"""Given a code file, please apply the change requests and generate the new file.
+
+Original file:
+```python
+{code}
+```
+
+Change request:
+{change_request}
+
+Please generate the new code file in the "New file" section below.""" # noqa: E501
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{
+                    "role": "user",
+                    "content": instruction
+                }],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    request_id=request_id_prefix + str(i),
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests, 
+                                       request_id_prefix, no_oversample)
+        
         return sampled_requests
 
 
@@ -1994,6 +2263,7 @@ class AIMODataset(HuggingFaceDataset):
                num_requests: int,
                output_len: Optional[int] = None,
                request_id_prefix: str = "",
+               no_oversample: bool = False,
                **kwargs) -> list:
         sampled_requests = []
         ind = 0
@@ -2026,7 +2296,7 @@ class AIMODataset(HuggingFaceDataset):
                 ))
             ind += 1
         self.maybe_oversample_requests(sampled_requests, num_requests,
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -2098,11 +2368,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
 
     def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
                request_id_prefix: str = "",
+               no_oversample: bool = False,
                **kwargs):
-        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
-            self.dataset_path)
+        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
         if formatting_prompt_func is None:
-            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+            raise ValueError(f"Unsupported dataset path: {self.hf_name}")
         samples = []
         for i, sample in enumerate(self.data):
             sample = formatting_prompt_func(sample)
@@ -2116,7 +2386,10 @@ class NextEditPredictionDataset(HuggingFaceDataset):
                 ))
             if len(samples) >= num_requests:
                 break
-        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
+        self.maybe_oversample_requests(samples, 
+                                       num_requests, 
+                                       request_id_prefix, 
+                                       no_oversample)
         return samples
 
 
@@ -2167,6 +2440,7 @@ class ASRDataset(HuggingFaceDataset):
         num_requests: int,
         output_len: Optional[int] = None,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list:
         output_len = (output_len
@@ -2205,7 +2479,7 @@ class ASRDataset(HuggingFaceDataset):
                 skipped,
             )
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -2243,6 +2517,7 @@ class MLPerfDataset(HuggingFaceDataset):
         num_requests: int,
         output_len: Optional[int] = None,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list[SampleRequest]:
         # Force dynamic output length based on reference completion.
@@ -2289,7 +2564,7 @@ class MLPerfDataset(HuggingFaceDataset):
             ind += 1
 
         self.maybe_oversample_requests(sampled_requests, num_requests, 
-                                       request_id_prefix)
+                                       request_id_prefix, no_oversample)
         return sampled_requests
 
 
@@ -2323,6 +2598,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
         num_prefixes: int = DEFAULT_NUM_PREFIXES,
         output_len: int = DEFAULT_OUTPUT_LEN,
         request_id_prefix: str = "",
+        no_oversample: bool = False,
         **kwargs,
     ) -> list[SampleRequest]:
         vocab_size = tokenizer.vocab_size
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 6bb2a497119e9f9f713f576a7a38282d380143b7..9d67580be26ad4a4beed69c211c74aa6793bd45f 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -17,6 +17,47 @@ from tqdm.asyncio import tqdm
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
 
+class StreamedResponseHandler:
+    """Handles streaming HTTP responses by accumulating chunks until complete
+    messages are available."""
+
+    def __init__(self):
+        self.buffer = ""
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages."""
+        chunk_str = chunk_bytes.decode("utf-8")
+        self.buffer += chunk_str
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while "\n\n" in self.buffer:
+            message, self.buffer = self.buffer.split("\n\n", 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # if self.buffer is not empty, check if it is a complete message
+        # by removing data: prefix and check if it is a valid JSON
+        if self.buffer.startswith("data: "):
+            message_content = self.buffer.removeprefix("data: ").strip()
+            if message_content == "[DONE]":
+                messages.append(self.buffer.strip())
+                self.buffer = ""
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ""
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 @dataclass
 class RequestFuncInput:
     """The input for the request function."""
@@ -102,46 +143,50 @@ async def async_request_openai_completions(
                                 headers=headers) as response:
             if response.status == 200:
                 first_chunk_received = False
-                async for chunk_bytes in response.content:
+                handler = StreamedResponseHandler()
+
+                async for chunk_bytes in response.content.iter_any():
                     chunk_bytes = chunk_bytes.strip()
                     if not chunk_bytes:
                         continue
-                    chunk_bytes = chunk_bytes.decode("utf-8")
-                    # NOTE: SSE comments (often used as pings) start with
-                    # a colon. These are not JSON data payload and should
-                    # be skipped.
-                    if chunk_bytes.startswith(":"):
-                        continue
 
-                    chunk = chunk_bytes.removeprefix("data: ")
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
+                        # NOTE: SSE comments (often used as pings) start with
+                        # a colon. These are not JSON data payload and should
+                        # be skipped.
+                        if message.startswith(":"):
+                            continue
 
-                    if chunk != "[DONE]":
-                        data = json.loads(chunk)
+                        chunk = message.removeprefix("data: ")
 
-                        # NOTE: Some completion API might have a last
-                        # usage summary response without a token so we
-                        # want to check a token was generated
-                        if choices := data.get("choices"):
-                            # Note that text could be empty here
-                            # e.g. for special tokens
-                            text = choices[0].get("text")
-                            timestamp = time.perf_counter()
-                            # First token
-                            if not first_chunk_received:
-                                first_chunk_received = True
-                                ttft = time.perf_counter() - st
-                                output.ttft = ttft
+                        if chunk != "[DONE]":
+                            data = json.loads(chunk)
 
-                            # Decoding phase
-                            else:
-                                output.itl.append(timestamp -
-                                                  most_recent_timestamp)
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+                                timestamp = time.perf_counter()
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
 
-                            most_recent_timestamp = timestamp
-                            generated_text += text or ""
-                        elif usage := data.get("usage"):
-                            output.output_tokens = usage.get(
-                                "completion_tokens")
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                    most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += text or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
                 if first_chunk_received:
                     output.success = True
                 else:
@@ -227,41 +272,44 @@ async def async_request_openai_chat_completions(
         async with session.post(url=api_url, json=payload,
                                 headers=headers) as response:
             if response.status == 200:
-                async for chunk_bytes in response.content:
+                handler = StreamedResponseHandler()
+                async for chunk_bytes in response.content.iter_any():
                     chunk_bytes = chunk_bytes.strip()
                     if not chunk_bytes:
                         continue
-                    chunk_bytes = chunk_bytes.decode("utf-8")
-                    # NOTE: SSE comments (often used as pings) start with
-                    # a colon. These are not JSON data payload and should
-                    # be skipped.
-                    if chunk_bytes.startswith(":"):
-                        continue
 
-                    chunk = chunk_bytes.removeprefix("data: ")
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
+                        # NOTE: SSE comments (often used as pings) start with
+                        # a colon. These are not JSON data payload and should
+                        # be skipped.
+                        if message.startswith(":"):
+                            continue
+
+                        chunk = message.removeprefix("data: ")
 
-                    if chunk != "[DONE]":
-                        timestamp = time.perf_counter()
-                        data = json.loads(chunk)
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
 
-                        if choices := data.get("choices"):
-                            content = choices[0]["delta"].get("content")
-                            # First token
-                            if ttft == 0.0:
-                                ttft = timestamp - st
-                                output.ttft = ttft
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
 
-                            # Decoding phase
-                            else:
-                                output.itl.append(timestamp -
-                                                  most_recent_timestamp)
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                    most_recent_timestamp)
 
-                            generated_text += content or ""
-                        elif usage := data.get("usage"):
-                            output.output_tokens = usage.get(
-                                "completion_tokens")
+                                generated_text += content or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
 
-                        most_recent_timestamp = timestamp
+                            most_recent_timestamp = timestamp
 
                 output.generated_text = generated_text
                 output.success = True
@@ -347,36 +395,40 @@ async def async_request_openai_audio(
                                     data=form,
                                     headers=headers) as response:
                 if response.status == 200:
-                    async for chunk_bytes in response.content:
+                    handler = StreamedResponseHandler()
+
+                    async for chunk_bytes in response.content.iter_any():
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
 
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data: ")
-                        if chunk != "[DONE]":
-                            timestamp = time.perf_counter()
-                            data = json.loads(chunk)
-
-                            if choices := data.get("choices"):
-                                content = choices[0]["delta"].get(
-                                    "content")
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = timestamp - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                else:
-                                    output.itl.append(
-                                        timestamp - most_recent_timestamp)
-
-                                generated_text += content or ""
-                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
-                                    "completion_tokens")
-
-                            most_recent_timestamp = timestamp
+                        messages = handler.add_chunk(chunk_bytes)
+                        for message in messages:
+                            chunk = message.decode("utf-8").removeprefix(
+                                "data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get(
+                                        "content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
 
                     output.generated_text = generated_text
                     output.success = True
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index abb838316cd31cdce50f07f19c49342edf8f03cf..a98eb2a78f103f097b5ad4255f622f3999edaed6 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -189,7 +189,7 @@ async def get_request(
         # NOTE: If we simply accumulate the random delta values
         # from the gamma distribution, their sum would have 1-2% gap
         # from target_total_delay_s. The purpose of the following logic is to
-        # close the gap for stablizing the throughput data
+        # close the gap for stabilizing the throughput data
         # from different random seeds.
         target_total_delay_s = total_requests / request_rate
         normalize_factor = target_total_delay_s / delay_ts[-1]
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index f022a55e625f58055b741d7ef58d11d9d6fe4dac..96e39fd92eba0784db246176737c1808e274fd56 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -37,6 +37,7 @@ def run_vllm(
     requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
+    do_profile: bool,
     disable_detokenize: bool = False,
 ) -> tuple[float, Optional[list[RequestOutput]]]:
     from vllm import LLM, SamplingParams
@@ -75,10 +76,14 @@ def run_vllm(
     outputs = None
     if not use_beam_search:
         start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
         outputs = llm.generate(prompts,
                                sampling_params,
                                lora_request=lora_requests,
                                use_tqdm=True)
+        if do_profile:
+            llm.stop_profile()
         end = time.perf_counter()
     else:
         assert lora_requests is None, "BeamSearch API does not support LoRA"
@@ -88,6 +93,8 @@ def run_vllm(
         for request in requests:
             assert request.expected_output_len == output_len
         start = time.perf_counter()
+        if do_profile:
+            llm.start_profile()
         llm.beam_search(
             prompts,
             BeamSearchParams(
@@ -95,6 +102,8 @@ def run_vllm(
                 max_tokens=output_len,
                 ignore_eos=True,
             ))
+        if do_profile:
+            llm.stop_profile()
         end = time.perf_counter()
     return end - start, outputs
 
@@ -103,6 +112,7 @@ def run_vllm_chat(
         requests: list[SampleRequest],
         n: int,
         engine_args: EngineArgs,
+        do_profile: bool,
         disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
     """
     Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
@@ -133,7 +143,11 @@ def run_vllm_chat(
                 detokenize=not disable_detokenize,
             ))
     start = time.perf_counter()
+    if do_profile:
+        llm.start_profile()
     outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    if do_profile:
+        llm.stop_profile()
     end = time.perf_counter()
     return end - start, outputs
 
@@ -142,6 +156,7 @@ async def run_vllm_async(
     requests: list[SampleRequest],
     n: int,
     engine_args: AsyncEngineArgs,
+    do_profile: bool,
     disable_frontend_multiprocessing: bool = False,
     disable_detokenize: bool = False,
 ) -> float:
@@ -185,6 +200,8 @@ async def run_vllm_async(
 
         generators = []
         start = time.perf_counter()
+        if do_profile:
+            await llm.start_profile()
         for i, (prompt, sp,
                 lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
             generator = llm.generate(prompt,
@@ -195,6 +212,8 @@ async def run_vllm_async(
         all_gens = merge_async_iterators(*generators)
         async for i, res in all_gens:
             pass
+        if do_profile:
+            await llm.stop_profile()
         end = time.perf_counter()
         return end - start
 
@@ -543,6 +562,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
                         type=str,
                         default=None,
                         help="Split of the HF dataset.")
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        default=False,
+        help="Use Torch Profiler. The env variable "
+        "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
 
     # prefix repetition dataset
     prefix_repetition_group = parser.add_argument_group(
@@ -600,22 +625,27 @@ def main(args: argparse.Namespace):
                     requests,
                     args.n,
                     AsyncEngineArgs.from_cli_args(args),
-                    args.disable_frontend_multiprocessing,
-                    args.disable_detokenize,
+                    disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
+                    disable_detokenize=args.disable_detokenize,
+                    do_profile=args.profile,
                 ))
         else:
             elapsed_time, request_outputs = run_vllm(
                 requests, args.n, EngineArgs.from_cli_args(args),
-                args.disable_detokenize)
+                disable_detokenize=args.disable_detokenize,
+                do_profile=args.profile)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
+        if args.profile:
+            raise NotImplementedError(
+                "Profiling not implemented yet for backend='hf'.")
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.hf_max_batch_size, args.trust_remote_code,
                               args.disable_detokenize)
     elif args.backend == "vllm-chat":
         elapsed_time, request_outputs = run_vllm_chat(
             requests, args.n, EngineArgs.from_cli_args(args),
-            args.disable_detokenize)
+            disable_detokenize=args.disable_detokenize, do_profile=args.profile)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
 
diff --git a/vllm/collect_env.py b/vllm/collect_env.py
index ee43ad12e8a5e67e48eaa8bab7cec4429cd2dbc5..fb9d3657790cffca3e7e0cf5d88cccf9b1abbea0 100644
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -54,7 +54,6 @@ SystemEnv = namedtuple(
         'is_xnnpack_available',
         'cpu_info',
         'rocm_version',  # vllm specific field
-        'neuron_sdk_version',  # vllm specific field
         'vllm_version',  # vllm specific field
         'vllm_build_flags',  # vllm specific field
         'gpu_topo',  # vllm specific field
@@ -75,6 +74,7 @@ DEFAULT_CONDA_PATTERNS = {
     "zmq",
     "nvidia",
     "pynvml",
+    "flashinfer-python",
 }
 
 DEFAULT_PIP_PATTERNS = {
@@ -90,6 +90,7 @@ DEFAULT_PIP_PATTERNS = {
     "zmq",
     "nvidia",
     "pynvml",
+    "flashinfer-python",
 }
 
 
@@ -275,15 +276,6 @@ def get_rocm_version(run_lambda):
                                      r'HIP version: (\S+)')
 
 
-def get_neuron_sdk_version(run_lambda):
-    # Adapted from your install script
-    try:
-        result = run_lambda(["neuron-ls"])
-        return result if result[0] == 0 else 'N/A'
-    except Exception:
-        return 'N/A'
-
-
 def get_vllm_version():
     from vllm import __version__, __version_tuple__
 
@@ -306,10 +298,9 @@ def get_vllm_version():
 
 def summarize_vllm_build_flags():
     # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
-    return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
+    return 'CUDA Archs: {}; ROCm: {}'.format(
         os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
         'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
-        'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
     )
 
 
@@ -498,6 +489,16 @@ def get_libc_version():
     return '-'.join(platform.libc_ver())
 
 
+def is_uv_venv():
+    if os.environ.get("UV"):
+        return True
+    pyvenv_cfg_path = os.path.join(sys.prefix, 'pyvenv.cfg')
+    if os.path.exists(pyvenv_cfg_path):
+        with open(pyvenv_cfg_path, 'r') as f:
+            return any(line.startswith('uv = ') for line in f)
+    return False
+
+
 def get_pip_packages(run_lambda, patterns=None):
     """Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
     if patterns is None:
@@ -513,7 +514,7 @@ def get_pip_packages(run_lambda, patterns=None):
 
         if pip_available:
             cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
-        elif os.environ.get("UV") is not None:
+        elif is_uv_venv():
             print("uv is set")
             cmd = ["uv", "pip", "list", "--format=freeze"]
         else:
@@ -601,7 +602,6 @@ def get_env_info():
     conda_packages = get_conda_packages(run_lambda)
 
     rocm_version = get_rocm_version(run_lambda)
-    neuron_sdk_version = get_neuron_sdk_version(run_lambda)
     vllm_version = get_vllm_version()
     vllm_build_flags = summarize_vllm_build_flags()
     gpu_topo = get_gpu_topo(run_lambda)
@@ -635,7 +635,6 @@ def get_env_info():
         is_xnnpack_available=is_xnnpack_available(),
         cpu_info=get_cpu_info(run_lambda),
         rocm_version=rocm_version,
-        neuron_sdk_version=neuron_sdk_version,
         vllm_version=vllm_version,
         vllm_build_flags=vllm_build_flags,
         gpu_topo=gpu_topo,
@@ -702,7 +701,6 @@ env_info_fmt += """
          vLLM Info
 ==============================
 ROCM Version                 : {rocm_version}
-Neuron SDK Version           : {neuron_sdk_version}
 vLLM Version                 : {vllm_version}
 vLLM Build Flags:
   {vllm_build_flags}
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 3361b65a9b8852c5d18aa0656a913fc0bb522fe8..3cc0fc3106f5aab7dd3b7576e2b3e8c57187f293 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -454,11 +454,12 @@ class VllmBackend:
         inductor_config = config.inductor_compile_config
         PASS_KEY = "post_grad_custom_post_pass"
         if PASS_KEY in inductor_config:
-            # Config should automatically wrap all inductor passes
             if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
+                # PassManager already added to config, make sure it's correct
                 assert (inductor_config[PASS_KEY].uuid() ==
                         self.post_grad_pass_manager.uuid())
             else:
+                # Config should automatically wrap all inductor passes
                 assert isinstance(inductor_config[PASS_KEY], InductorPass)
                 self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 7a99aaff707dcff30625ea33687ce54a887b9fb3..71274420c342626a51f551010df0c93f455b2756 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -513,7 +513,7 @@ if flashinfer_comm is not None:
                         torch.ops._C.static_scaled_fp8_quant(
                             quant_out, norm_out, scale_factor)
             if scale_factor is None or norm_out is not None:
-                # we need to return allreduce outpput
+                # we need to return allreduce output
                 # in cases of non quant fused AR + RMS norm
                 # and fused AR + RMS norm + quant without fused add
                 allreduce_in.copy_(allreduce_out)
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 3095f17110fdecc3c3fd23b96e2b763715058d86..e3677b3dd62d8c037a250d2365c323ccd224ef8e 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -39,6 +39,7 @@ class AttentionQuantPattern(ABC):
         self,
         layer: Attention,
         quant_key: QuantKey,
+        dtype: torch.dtype,
     ):
         self.layer = layer
         self.layer_name = layer.layer_name
@@ -46,11 +47,16 @@ class AttentionQuantPattern(ABC):
         self.head_size = layer.head_size
         self.quant_key = quant_key
         self.quant_dtype = quant_key.dtype
+        self.dtype = dtype
 
         assert self.quant_key in QUANT_OPS, \
             f"unsupported quantization scheme {self.quant_key}"
         self.QUANT_OP = QUANT_OPS[self.quant_key]
 
+    def empty(self, *args, **kwargs):
+        kwargs = {'dtype': self.dtype, 'device': "cuda", **kwargs}
+        return torch.empty(*args, **kwargs)
+
     def empty_quant(self, *args, **kwargs):
         kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
         return torch.empty(*args, **kwargs)
@@ -91,12 +97,13 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
     def __init__(
         self,
         layer: Attention,
+        dtype: torch.dtype,
         symmetric: bool = True,
     ):
         quant_key = QuantKey(dtype=FP8_DTYPE,
                              scale=kStaticTensorScale,
                              symmetric=symmetric)
-        super().__init__(layer, quant_key)
+        super().__init__(layer, quant_key, dtype)
 
     def _register(self, pm_pass: PatternMatcherPass):
 
@@ -139,10 +146,14 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
             return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
 
         inputs = [
-            empty_bf16(5, self.num_heads, self.head_size),  # q
-            empty_bf16(5, self.num_heads, self.head_size),  # k
-            empty_bf16(5, self.num_heads, self.head_size),  # v
-            empty_bf16(5, self.num_heads, self.head_size),  # attn_output
+            self.empty(5, self.num_heads, self.head_size,
+                       dtype=self.dtype),  # q
+            self.empty(5, self.num_heads, self.head_size,
+                       dtype=self.dtype),  # k
+            self.empty(5, self.num_heads, self.head_size,
+                       dtype=self.dtype),  # v
+            self.empty(5, self.num_heads, self.head_size,
+                       dtype=self.dtype),  # attn_output
             self.empty_quant(5,
                              self.num_heads * self.head_size),  # quant_output
             empty_fp32(1, 1)  # scale
@@ -165,8 +176,8 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
     will be passed into Attention op as the `output_scale` argument.
     """
 
-    def __init__(self, layer: Attention):
-        super().__init__(layer, kNvfp4Quant)
+    def __init__(self, layer: Attention, dtype: torch.dtype):
+        super().__init__(layer, kNvfp4Quant, dtype)
 
     def _register(self, pm_pass: PatternMatcherPass):
 
@@ -255,11 +266,15 @@ class AttnFusionPass(VllmInductorPass):
 
         attn_layers = get_layers_from_vllm_config(config, Attention)
         for layer_name, layer in attn_layers.items():
-            pattern_fp8 = AttentionFp8StaticQuantPattern(layer)
+            pattern_fp8 = AttentionFp8StaticQuantPattern(
+                layer, config.model_config.dtype)
             pattern_fp8.register_if_supported(self.patterns)
 
-            pattern_nvfp4 = AttentionNvfp4QuantPattern(layer)
-            pattern_nvfp4.register_if_supported(self.patterns)
+            if current_platform.is_cuda() and hasattr(torch.ops._C,
+                                                      "scaled_fp4_quant"):
+                pattern_nvfp4 = AttentionNvfp4QuantPattern(
+                    layer, config.model_config.dtype)
+                pattern_nvfp4.register_if_supported(self.patterns)
 
         if len(attn_layers) == 0:
             logger.warning(
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 5337c30050e1d57f7330007e51df5c74bfddb6ba..6503ce54aae062d719e86f75273dd7d6665dd7bc 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -8,8 +8,8 @@ import enum
 import hashlib
 import inspect
 import json
+import os
 import textwrap
-import uuid
 import warnings
 from collections.abc import Mapping
 from contextlib import contextmanager
@@ -33,12 +33,17 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
                                PrefixCachingHashAlgo)
 from vllm.config.compilation import (CompilationConfig, CompilationLevel,
                                      CUDAGraphMode, PassConfig)
+from vllm.config.kv_events import KVEventsConfig
+from vllm.config.kv_transfer import KVTransferConfig
+from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.utils import ConfigType, config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import (
     ConfigFormat, get_config, get_hf_image_processor_config,
@@ -47,9 +52,11 @@ from vllm.transformers_utils.config import (
     is_interleaved, maybe_override_with_speculators_target_model,
     try_get_generation_config, try_get_safetensors_metadata,
     try_get_tokenizer_config, uses_mrope)
-from vllm.transformers_utils.s3_utils import S3Model
-from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
+from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
+                                                 is_runai_obj_uri)
+from vllm.transformers_utils.utils import maybe_model_redirect
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
+                        STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)
 
 if TYPE_CHECKING:
@@ -61,8 +68,6 @@ if TYPE_CHECKING:
     from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.layers.quantization.base_config import (
         QuantizationConfig)
-    from vllm.model_executor.model_loader import LoadFormats
-    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.sample.logits_processor import LogitsProcessor
 
     HfOverrides = Union[dict, Callable[[type], type]]
@@ -72,8 +77,6 @@ else:
     QuantizationConfig = Any
     QuantizationMethods = Any
     BaseModelLoader = Any
-    LoadFormats = Any
-    TensorizerConfig = Any
     LogitsProcessor = Any
     HfOverrides = Union[dict[str, Any], Callable[[type], type]]
 
@@ -170,6 +173,7 @@ class ModelImpl(str, enum.Enum):
     AUTO = "auto"
     VLLM = "vllm"
     TRANSFORMERS = "transformers"
+    TERRATORCH = "terratorch"
 
 
 def get_attr_docs(cls: type[Any]) -> dict[str, str]:
@@ -418,7 +422,7 @@ class ModelConfig:
     `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
     use_async_output_proc: bool = True
     """Whether to use async output processor."""
-    config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
+    config_format: Union[str, ConfigFormat] = "auto"
     """The format of the model config to load:\n
     - "auto" will try to load the config in hf format if available else it
     will try to load in mistral format.\n
@@ -459,11 +463,6 @@ class ModelConfig:
         DP (which is controlled by `--data-parallel-size`).
         This is only supported on a per-model basis and falls back to
         `"weights"` if the encoder does not support DP."""
-    override_neuron_config: dict[str, Any] = field(default_factory=dict)
-    """Initialize non-default neuron config or override default neuron config
-    that are specific to Neuron devices, this argument will be used to
-    configure the neuron config that can not be gathered from the vllm
-    arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
     pooler_config: Optional["PoolerConfig"] = field(init=False)
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
@@ -495,7 +494,9 @@ class ModelConfig:
     back to the Transformers implementation if no vLLM implementation is
     available.\n
     - "vllm" will use the vLLM model implementation.\n
-    - "transformers" will use the Transformers model implementation."""
+    - "transformers" will use the Transformers model implementation.\n
+    - "terratorch" will use the TerraTorch model implementation.
+    """
     override_attention_dtype: Optional[str] = None
     """Override dtype for attention"""
     logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
@@ -560,15 +561,6 @@ class ModelConfig:
                     "affect the random state of the Python process that "
                     "launched vLLM.", self.seed)
 
-        if self.runner != "draft":
-            # If we're not running the draft model, check for speculators config
-            # If speculators config, set model / tokenizer to be target model
-            self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
-                model=self.model,
-                tokenizer=self.tokenizer,
-                revision=self.revision,
-                trust_remote_code=self.trust_remote_code)
-
         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
@@ -607,7 +599,16 @@ class ModelConfig:
                 f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)
 
-        self.maybe_pull_model_tokenizer_for_s3(self.model, self.tokenizer)
+        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
+
+        if self.runner != "draft":
+            # If we're not running the draft model, check for speculators config
+            # If speculators config, set model / tokenizer to be target model
+            self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
+                model=self.model,
+                tokenizer=self.tokenizer,
+                revision=self.revision,
+                trust_remote_code=self.trust_remote_code)
 
         if (backend := envs.VLLM_ATTENTION_BACKEND
             ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
@@ -630,9 +631,6 @@ class ModelConfig:
             raise ValueError(
                 "Sleep mode is not supported on current platform.")
 
-        if isinstance(self.config_format, str):
-            self.config_format = ConfigFormat(self.config_format)
-
         hf_config = get_config(self.hf_config_path or self.model,
                                self.trust_remote_code,
                                self.revision,
@@ -749,7 +747,7 @@ class ModelConfig:
 
         self.pooler_config = self._init_pooler_config()
 
-        self.dtype = _get_and_verify_dtype(
+        self.dtype: torch.dtype = _get_and_verify_dtype(
             self.model,
             self.hf_config,
             self.dtype,
@@ -785,10 +783,6 @@ class ModelConfig:
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()
 
-        if (not current_platform.is_neuron() and self.override_neuron_config):
-            raise ValueError(
-                "`override_neuron_config` is only supported on Neuron.")
-
         # Avoid running try_verify_and_update_config multiple times
         self.config_updated = False
 
@@ -840,41 +834,42 @@ class ModelConfig:
         """The architecture vllm actually used."""
         return self._architecture
 
-    def maybe_pull_model_tokenizer_for_s3(self, model: str,
-                                          tokenizer: str) -> None:
-        """Pull model/tokenizer from S3 to temporary directory when needed.
+    def maybe_pull_model_tokenizer_for_runai(self, model: str,
+                                             tokenizer: str) -> None:
+        """Pull model/tokenizer from Object Storage to temporary
+        directory when needed.
 
         Args:
             model: Model name or path
             tokenizer: Tokenizer name or path
         """
-        if not (is_s3(model) or is_s3(tokenizer)):
+        if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
             return
 
-        if is_s3(model):
-            s3_model = S3Model()
-            s3_model.pull_files(model,
-                                allow_pattern=["*.model", "*.py", "*.json"])
+        if is_runai_obj_uri(model):
+            object_storage_model = ObjectStorageModel()
+            object_storage_model.pull_files(
+                model, allow_pattern=["*.model", "*.py", "*.json"])
             self.model_weights = model
-            self.model = s3_model.dir
+            self.model = object_storage_model.dir
 
             # If tokenizer is same as model, download to same directory
             if model == tokenizer:
-                s3_model.pull_files(model,
-                                    ignore_pattern=[
-                                        "*.pt", "*.safetensors", "*.bin",
-                                        "*.tensors"
-                                    ])
-                self.tokenizer = s3_model.dir
+                object_storage_model.pull_files(model,
+                                                ignore_pattern=[
+                                                    "*.pt", "*.safetensors",
+                                                    "*.bin", "*.tensors"
+                                                ])
+                self.tokenizer = object_storage_model.dir
                 return
 
         # Only download tokenizer if needed and not already handled
-        if is_s3(tokenizer):
-            s3_tokenizer = S3Model()
-            s3_tokenizer.pull_files(
+        if is_runai_obj_uri(tokenizer):
+            object_storage_tokenizer = ObjectStorageModel()
+            object_storage_tokenizer.pull_files(
                 model,
                 ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
-            self.tokenizer = s3_tokenizer.dir
+            self.tokenizer = object_storage_tokenizer.dir
 
     def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
         if self._model_info.supports_multimodal:
@@ -1101,11 +1096,11 @@ class ModelConfig:
 
         assert_never(runner_type)
 
-    def _parse_quant_hf_config(self):
-        quant_cfg = getattr(self.hf_config, "quantization_config", None)
+    def _parse_quant_hf_config(self, hf_config: PretrainedConfig):
+        quant_cfg = getattr(hf_config, "quantization_config", None)
         if quant_cfg is None:
             # compressed-tensors uses a "compression_config" key
-            quant_cfg = getattr(self.hf_config, "compression_config", None)
+            quant_cfg = getattr(hf_config, "compression_config", None)
 
         else:
             # Set quant_method for ModelOpt models.
@@ -1146,7 +1141,11 @@ class ModelConfig:
                                      self.quantization)
 
         # Parse quantization method from the HF model config, if available.
-        quant_cfg = self._parse_quant_hf_config()
+        quant_cfg = self._parse_quant_hf_config(self.hf_config)
+        if quant_cfg is None and (text_config := getattr(
+                self.hf_config, "text_config", None)):
+            # Check the text config as well for multi-modal models.
+            quant_cfg = self._parse_quant_hf_config(text_config)
 
         if quant_cfg is not None:
             # Use the community standard 'quant_method'
@@ -1178,7 +1177,7 @@ class ModelConfig:
             ]
             # Any custom overrides will be in quantization_methods so we place
             # them at the start of the list so custom overrides have preference
-            # over the built in ones.
+            # over the built-in ones.
             quantization_methods = quantization_methods + overrides
 
             # Detect which checkpoint is it
@@ -1308,6 +1307,10 @@ class ModelConfig:
                     self.hf_config.dual_chunk_attention_config[
                         "sparse_attention_enabled"] = True
 
+            if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL:
+                raise ValueError("please set VLLM_ATTENTION_BACKEND to "
+                                 f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -1422,6 +1425,11 @@ class ModelConfig:
         if getattr(self.hf_text_config, "head_dim", None) is not None:
             return self.hf_text_config.head_dim
 
+        # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
+        if getattr(self.hf_text_config, "hidden_size_per_head",
+                   None) is not None:
+            return self.hf_text_config.hidden_size_per_head
+
         # FIXME(woosuk): This may not be true for all models.
         return (self.hf_text_config.hidden_size //
                 self.hf_text_config.num_attention_heads)
@@ -1505,7 +1513,8 @@ class ModelConfig:
         if (self.hf_text_config.model_type == "deepseek_mtp"
                 or self.hf_config.model_type == "mimo_mtp"
                 or self.hf_config.model_type == "glm4_moe_mtp"
-                or self.hf_config.model_type == "ernie_mtp"):
+                or self.hf_config.model_type == "ernie_mtp"
+                or self.hf_config.model_type == "qwen3_next_mtp"):
             total_num_hidden_layers = getattr(self.hf_text_config,
                                               "num_nextn_predict_layers", 0)
         else:
@@ -1549,7 +1558,7 @@ class ModelConfig:
                        for bc in block_configs[start:end])
         else:
             # Hybrid model Jamba
-            layers_block_type_value = getattr(self.hf_config,
+            layers_block_type_value = getattr(self.hf_text_config,
                                               "layers_block_type", None)
             if layers_block_type_value is not None:
                 if hasattr(self.hf_text_config,
@@ -1568,15 +1577,28 @@ class ModelConfig:
             if attn_type_list:
                 return sum(t == 1 for t in attn_type_list[start:end])
 
-            if layers_block_type_value is None and attn_type_list is None:
+            # Hybrid model Qwen3Next
+            layer_types_value = getattr(self.hf_config, "layer_types", None)
+            if layer_types_value is not None:
+                if getattr(block_type, "value", block_type) == "attention":
+                    return sum(t == "full_attention"
+                               for t in layer_types_value[start:end])
+                elif getattr(block_type, "value",
+                             block_type) == "linear_attention":
+                    return sum(t == "linear_attention"
+                               for t in layer_types_value[start:end])
+                else:
+                    return sum(t == getattr(block_type, "value", block_type)
+                               for t in layer_types_value[start:end])
+
+            if (layers_block_type_value is None and attn_type_list is None
+                    and layer_types_value is None):
                 raise ValueError(
                     "The model is an hybrid without a"
-                    "layers_block_type or an attn_type_list in the hf_config,"
-                    "cannot determine the num of "
+                    "layers_block_type or an attn_type_list, or a layer_types "
+                    "in the hf_config, cannot determine the num of "
                     f"{block_type.value} layers")
 
-            return sum(t == 1 for t in attn_type_list[start:end])
-
     def get_mamba_chunk_size(self) -> Optional[int]:
         """
         Returns the mamba chunk size if it exists
@@ -1687,13 +1709,7 @@ class ModelConfig:
         """
         For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
         True to enable cross-attention
-        Neuron needs all multimodal data to be in the decoder and does not
-        need to explicitly enable cross-attention
         """
-        if (current_platform.is_neuron()
-                and self.hf_config.model_type == "mllama"):
-            return False
-
         return is_encoder_decoder(self.hf_config)
 
     @property
@@ -1756,6 +1772,32 @@ class ModelConfig:
         # `llm as reranker` models defaults to not using pad_token.
         return getattr(self.hf_config, "use_pad_token", True)
 
+    @property
+    def head_dtype(self) -> torch.dtype:
+        """
+        "head" refers to the last Linear layer(s) of an LLM,
+        such as the lm_head in a generation model,
+        or the score or classifier in a classification model.
+
+        The default head_dtype based on runner_type.\n
+        - The pooling model defaults to using fp32 head,
+        you can use --hf-overrides '{"head_dtype": "model"}' to disable it.\n
+        - The generate model defaults to not using fp32 head,
+        you can use --hf-overrides '{"head_dtype": "float32"}' to enable it.
+        """
+        head_dtype = _get_head_dtype(config=self.hf_config,
+                                     dtype=self.dtype,
+                                     runner_type=self.runner_type)
+
+        if head_dtype not in current_platform.supported_dtypes:
+            logger.warning_once(
+                "The current platform does not support [%s] head dtype, "
+                "fallback to model dtype [%s].", head_dtype, self.dtype)
+            return self.dtype
+
+        logger.debug_once("head dtype: %s", head_dtype)
+        return head_dtype
+
     def get_and_verify_max_len(self, max_model_len: int):
         # Consider max_model_len in tokenizer_config only when
         # pooling models use absolute position_embedding.
@@ -1778,90 +1820,7 @@ class ModelConfig:
         return max_model_len
 
 
-@config
-@dataclass
-class LoadConfig:
-    """Configuration for loading the model weights."""
-
-    load_format: Union[str, LoadFormats] = "auto"
-    """The format of the model weights to load:\n
-    - "auto" will try to load the weights in the safetensors format and fall
-    back to the pytorch bin format if safetensors format is not available.\n
-    - "pt" will load the weights in the pytorch bin format.\n
-    - "safetensors" will load the weights in the safetensors format.\n
-    - "npcache" will load the weights in pytorch format and store a numpy cache
-    to speed up the loading.\n
-    - "dummy" will initialize the weights with random values, which is mainly
-    for profiling.\n
-    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
-    loading. See the Tensorize vLLM Model script in the Examples section for
-    more information.\n
-    - "runai_streamer" will load the Safetensors weights using Run:ai Model
-    Streamer.\n
-    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
-    - "sharded_state" will load weights from pre-sharded checkpoint files,
-    supporting efficient loading of tensor-parallel models.\n
-    - "gguf" will load weights from GGUF format files (details specified in
-    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
-    - "mistral" will load weights from consolidated safetensors files used by
-    Mistral models.
-    - Other custom values can be supported via plugins."""
-    download_dir: Optional[str] = None
-    """Directory to download and load the weights, default to the default
-    cache directory of Hugging Face."""
-    model_loader_extra_config: Union[dict, TensorizerConfig] = field(
-        default_factory=dict)
-    """Extra config for model loader. This will be passed to the model loader
-    corresponding to the chosen load_format."""
-    device: Optional[str] = None
-    """Device to which model weights will be loaded, default to
-    device_config.device"""
-    ignore_patterns: Optional[Union[list[str], str]] = None
-    """The list of patterns to ignore when loading the model. Default to
-    "original/**/*" to avoid repeated loading of llama's checkpoints."""
-    use_tqdm_on_load: bool = True
-    """Whether to enable tqdm for showing progress bar when loading model
-    weights."""
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu"
-    """
-    pt_load_map_location: the map location for loading pytorch checkpoint, to
-    support loading checkpoints can only be loaded on certain devices like
-    "cuda", this is equivalent to {"": "cuda"}. Another supported format is
-    mapping from different devices like from GPU 1 to GPU 0:
-    {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
-    in dictionary needs to be double quoted for json parsing. For more details,
-    see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        self.load_format = self.load_format.lower()
-        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
-            logger.info(
-                "Ignoring the following patterns when downloading weights: %s",
-                self.ignore_patterns)
-        else:
-            self.ignore_patterns = ["original/**/*"]
-
-Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
+Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
 
 
 @config
@@ -1917,9 +1876,7 @@ class DeviceConfig:
                 self.device_type = self.device.type
 
         # Some device types require processing inputs on CPU
-        if self.device_type in ["neuron"]:
-            self.device = torch.device("cpu")
-        elif self.device_type in ["tpu"]:
+        if self.device_type in ["tpu"]:
             self.device = None
         else:
             # Set device with device type
@@ -1928,7 +1885,7 @@ class DeviceConfig:
 
 SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                             "mlp_speculator", "draft_model", "deepseek_mtp",
-                            "ernie_mtp"]
+                            "ernie_mtp", "qwen3_next_mtp"]
 
 
 @config
@@ -2069,7 +2026,15 @@ class SpeculativeConfig:
                 "n_predict": n_predict,
                 "architectures": ["ErnieMTPModel"]
             })
-            return hf_config
+
+        if hf_config.model_type == "qwen3_next":
+            hf_config.model_type = "qwen3_next_mtp"
+        if hf_config.model_type == "qwen3_next_mtp":
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update({
+                "n_predict": n_predict,
+                "architectures": ["Qwen3NextMTP"]
+            })
 
         return hf_config
 
@@ -2090,9 +2055,13 @@ class SpeculativeConfig:
                 (self.target_model_config.hf_text_config.model_type \
                         == "deepseek_v3" or
                     self.target_model_config.hf_text_config.model_type in
-                        ("mimo","ernie4_5_moe")):
+                        ("mimo","ernie4_5_moe", "qwen3_next")):
                 # use the draft model from the same model:
                 self.model = self.target_model_config.model
+                # Align the quantization of draft model for cases such as
+                # --quantization fp8 with a bf16 checkpoint.
+                if not self.quantization:
+                    self.quantization = self.target_model_config.quantization
             elif self.method in ("ngram", "[ngram]"):
                 self.model = "ngram"
             else:
@@ -2171,9 +2140,14 @@ class SpeculativeConfig:
                 # Automatically detect the method
                 if self.method in ('eagle', 'eagle3'):
                     pass
-                elif "eagle-" in self.draft_model_config.model.lower() or \
-                        "eagle3-" in self.draft_model_config.model.lower():
+                # examples:
+                # yuhuili/EAGLE-LLaMA3-Instruct-8B
+                # yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
+                # AngelSlim/Qwen3-8B_eagle3
+                elif "eagle-" in self.draft_model_config.model.lower():
                     self.method = "eagle"
+                elif "eagle3" in self.draft_model_config.model.lower():
+                    self.method = "eagle3"
                 elif self.draft_model_config.hf_config.model_type == "medusa":
                     self.method = "medusa"
                 elif (self.draft_model_config.hf_config.model_type ==
@@ -2197,6 +2171,15 @@ class SpeculativeConfig:
                                 "one layer. Might need some code changes " \
                                 "to support multiple layers."
                             )
+                elif (self.draft_model_config.hf_config.model_type ==
+                      "qwen3_next_mtp"):
+                    self.method = "qwen3_next_mtp"
+                    if self.num_speculative_tokens > 1:
+                        logger.warning(
+                                "All Qwen3Next MTP models only have " \
+                                "one layer. Might need some code changes " \
+                                "to support multiple layers."
+                            )
                 else:
                     self.method = "draft_model"
                     raise NotImplementedError(
@@ -2412,7 +2395,8 @@ class SpeculativeConfig:
         return self.num_speculative_tokens
 
     def use_eagle(self) -> bool:
-        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp")
+        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
+                               "qwen3_next_mtp")
 
     def __repr__(self) -> str:
         method = self.method
@@ -2421,111 +2405,6 @@ class SpeculativeConfig:
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
 
 
-LoRADType = Literal["auto", "float16", "bfloat16"]
-
-
-@config
-@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
-class LoRAConfig:
-    """Configuration for LoRA."""
-
-    max_lora_rank: int = 16
-    """Max LoRA rank."""
-    max_loras: int = 1
-    """Max number of LoRAs in a single batch."""
-    fully_sharded_loras: bool = False
-    """By default, only half of the LoRA computation is sharded with tensor
-    parallelism. Enabling this will use the fully sharded layers. At high
-    sequence length, max rank or tensor parallel size, this is likely faster.
-    """
-    max_cpu_loras: Optional[int] = None
-    """Maximum number of LoRAs to store in CPU memory. Must be >= than
-    `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
-    """Data type for LoRA. If auto, will default to base model dtype."""
-    lora_extra_vocab_size: int = 256
-    """(Deprecated) Maximum size of extra vocabulary that can be present in a 
-    LoRA adapter. Will be removed in v0.12.0."""
-    lora_vocab_padding_size: ClassVar[int] = current_platform\
-        .get_lora_vocab_padding_size()
-
-    default_mm_loras: Optional[dict[str, str]] = None
-    """Dictionary mapping specific modalities to LoRA model paths; this field
-    is only applicable to multimodal models and should be leveraged when a
-    model always expects a LoRA to be active when a given modality is present.
-    Note that currently, if a request provides multiple additional
-    modalities, each of which have their own LoRA, we do NOT apply
-    default_mm_loras because we currently only support one lora adapter
-    per prompt. When run in offline mode, the lora IDs for n modalities
-    will be automatically assigned to 1-n with the names of the modalities
-    in alphabetic order."""
-    bias_enabled: bool = False
-    """Enable bias for LoRA adapters."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        factors: list[Any] = []
-        factors.append(self.max_lora_rank)
-        factors.append(self.max_loras)
-        factors.append(self.fully_sharded_loras)
-        factors.append(self.lora_dtype)
-        factors.append(self.lora_extra_vocab_size)
-        factors.append(self.lora_vocab_padding_size)
-        factors.append(self.bias_enabled)
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        # Deprecation warning for lora_extra_vocab_size
-        logger.warning(
-            "`lora_extra_vocab_size` is deprecated and will be removed "
-            "in v0.12.0. Additional vocabulary support for "
-            "LoRA adapters is being phased out.")
-
-        # Setting the maximum rank to 512 should be able to satisfy the vast
-        # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
-        possible_lora_extra_vocab_size = (256, 512)
-        if self.max_lora_rank not in possible_max_ranks:
-            raise ValueError(
-                f"max_lora_rank ({self.max_lora_rank}) must be one of "
-                f"{possible_max_ranks}.")
-        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
-            raise ValueError(
-                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
-                f"must be one of {possible_lora_extra_vocab_size}.")
-        if self.max_loras < 1:
-            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
-        if self.max_cpu_loras is None:
-            self.max_cpu_loras = self.max_loras
-        elif self.max_cpu_loras < self.max_loras:
-            raise ValueError(
-                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_loras ({self.max_loras})")
-
-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError(
-                "V0 LoRA does not support CPU offload, please use V1.")
-
-    def verify_with_model_config(self, model_config: ModelConfig):
-        if self.lora_dtype in (None, "auto"):
-            self.lora_dtype = model_config.dtype
-        elif isinstance(self.lora_dtype, str):
-            self.lora_dtype = getattr(torch, self.lora_dtype)
-
-
 @config
 @dataclass
 class MultiModalConfig:
@@ -2654,24 +2533,46 @@ class PoolerConfig:
     ## for embeddings models
     normalize: Optional[bool] = None
     """
-    Whether to normalize the embeddings outputs.
+    Whether to normalize the embeddings outputs. Defaults to True.
     """
     dimensions: Optional[int] = None
     """
     Reduce the dimensions of embeddings if model
-    support matryoshka representation.
+    support matryoshka representation. Defaults to None.
+    """
+    enable_chunked_processing: Optional[bool] = None
+    """
+    Whether to enable chunked processing for long inputs that exceed the model's
+    maximum position embeddings. When enabled, long inputs will be split into
+    chunks, processed separately, and then aggregated using weighted averaging.
+    This allows embedding models to handle arbitrarily long text without CUDA
+    errors. Defaults to False.
+    """
+    max_embed_len: Optional[int] = None
+    """
+    Maximum input length allowed for embedding generation. When set, allows
+    inputs longer than max_embed_len to be accepted for embedding models.
+    When an input exceeds max_embed_len, it will be handled according to 
+    the original max_model_len validation logic. 
+    Defaults to None (i.e. set to max_model_len).
     """
 
     ## for classification models
     activation: Optional[bool] = None
     """
     Whether to apply activation function to the classification outputs.
+    Defaults to True.
+    """
+    logit_bias: Optional[float] = None
+    """
+    If provided, apply classification logit biases. Defaults to None.
     """
 
     ## for reward models
     softmax: Optional[bool] = None
     """
     Whether to apply softmax to the reward outputs.
+    Defaults to True.
     """
     step_tag_id: Optional[int] = None
     """
@@ -2686,25 +2587,6 @@ class PoolerConfig:
     ``math-shepherd-mistral-7b-prm`` model.
     """
 
-    enable_chunked_processing: Optional[bool] = None
-    """
-    Whether to enable chunked processing for long inputs that exceed the model's
-    maximum position embeddings. When enabled, long inputs will be split into
-    chunks, processed separately, and then aggregated using weighted averaging.
-    This allows embedding models to handle arbitrarily long text without CUDA
-    errors. Defaults to False.
-    """
-
-    max_embed_len: Optional[int] = None
-    """
-    Maximum input length allowed for embedding generation. When set, allows
-    inputs longer than max_embed_len to be accepted for embedding models.
-    This parameter enables accepting long inputs without requiring
-    VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
-    max_embed_len, it will be handled according to the original max_model_len
-    validation logic. Defaults to None (i.e. set to max_model_len).
-    """
-
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -2737,6 +2619,8 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
 _FLOAT16_NOT_SUPPORTED_MODELS = {
     "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
     "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
+    "gemma3_text":
+    "Numerical instability. Please use bfloat16 or float32 instead.",
     "plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
     "glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
 }
@@ -2889,6 +2773,31 @@ def _get_and_verify_dtype(
     return torch_dtype
 
 
+def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype,
+                    runner_type: str) -> torch.dtype:
+    head_dtype: Optional[Union[str,
+                               torch.dtype]] = getattr(config, "head_dtype",
+                                                       None)
+
+    if head_dtype == "model":
+        return dtype
+    elif isinstance(head_dtype, str):
+        head_dtype = head_dtype.lower()
+        if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+            raise ValueError(f"Unknown dtype: {head_dtype!r}")
+        return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype]
+    elif isinstance(head_dtype, torch.dtype):
+        return head_dtype
+    elif head_dtype is None:
+        if torch.float32 not in current_platform.supported_dtypes:
+            return dtype
+        if runner_type == "pooling":
+            return torch.float32
+        return dtype
+    else:
+        raise ValueError(f"Unknown dtype: {head_dtype}")
+
+
 def _get_and_verify_max_len(
     hf_config: PretrainedConfig,
     tokenizer_config: Optional[dict],
@@ -3024,16 +2933,20 @@ def _get_and_verify_max_len(
                 f"User-specified max_model_len ({max_model_len}) is greater "
                 f"than the derived max_model_len ({max_len_key}="
                 f"{derived_max_model_len} or model_max_length="
-                f"{model_max_length} in model's config.json). This may lead "
-                "to incorrect model outputs or CUDA errors.")
+                f"{model_max_length} in model's config.json).")
+            warning = (
+                "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
+                "caution. If the model uses relative position encoding (RoPE), "
+                "positions exceeding derived_max_model_len lead to nan. If the "
+                "model uses absolute position encoding, positions exceeding "
+                "derived_max_model_len will cause a CUDA array out-of-bounds "
+                "error.")
             if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-                logger.warning(
-                    "%s Make sure the value is correct and within the "
-                    "model context size.", msg)
+                logger.warning_once("%s %s", msg, warning)
             else:
                 raise ValueError(
                     f"{msg} To allow overriding this maximum, set "
-                    "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
+                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
     return int(max_model_len)
 
 
@@ -3202,149 +3115,6 @@ class ObservabilityConfig:
             self.collect_detailed_traces[0].split(","))
 
 
-KVProducer = Literal["kv_producer", "kv_both"]
-KVConsumer = Literal["kv_consumer", "kv_both"]
-KVRole = Literal[KVProducer, KVConsumer]
-
-
-@config
-@dataclass
-class KVTransferConfig:
-    """Configuration for distributed KV cache transfer."""
-
-    kv_connector: Optional[str] = None
-    """The KV connector for vLLM to transmit KV caches between vLLM instances.
-    """
-
-    engine_id: Optional[str] = None
-    """The engine id for KV transfers."""
-
-    kv_buffer_device: Optional[str] = "cuda"
-    """The device used by kv connector to buffer the KV cache.
-    Currently only support 'cuda'."""
-
-    kv_buffer_size: float = 1e9
-    """The buffer size for TorchDistributedConnector. Measured in number of
-    bytes. Recommended value: 1e9 (about 1GB)."""
-
-    kv_role: Optional[KVRole] = None
-    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
-    are 'kv_producer', 'kv_consumer', and 'kv_both'."""
-
-    kv_rank: Optional[int] = None
-    """The rank of this vLLM instance in the KV cache transfer. Typical value:
-    0 for prefill instance, 1 for decode instance.
-    Currently only 1P1D is supported."""
-
-    kv_parallel_size: int = 1
-    """The number of parallel instances for KV cache transfer. For
-    PyNcclConnector, this should be 2."""
-
-    kv_ip: str = "127.0.0.1"
-    """The KV connector ip, used to build distributed connection."""
-
-    kv_port: int = 14579
-    """The KV connector port, used to build distributed connection."""
-
-    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
-    """any extra config that the connector may need."""
-
-    kv_connector_module_path: Optional[str] = None
-    """The Python module path to dynamically load the KV connector from.
-    Only supported in V1."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self) -> None:
-        if self.engine_id is None:
-            self.engine_id = str(uuid.uuid4())
-
-        if self.kv_role is not None and self.kv_role not in get_args(KVRole):
-            raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
-                             f"Supported roles are {get_args(KVRole)}")
-
-        if self.kv_connector is not None and self.kv_role is None:
-            raise ValueError("Please specify kv_disagg_role when kv_connector "
-                             f"is set, supported roles are {get_args(KVRole)}")
-
-    @property
-    def is_kv_transfer_instance(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVRole)
-
-    @property
-    def is_kv_producer(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVProducer)
-
-    @property
-    def is_kv_consumer(self) -> bool:
-        return self.kv_connector is not None and \
-            self.kv_role in get_args(KVConsumer)
-
-    def get_from_extra_config(self, key, default) -> Any:
-        return self.kv_connector_extra_config.get(key, default)
-
-
-@config
-@dataclass
-class KVEventsConfig:
-    """Configuration for KV event publishing."""
-
-    enable_kv_cache_events: bool = False
-    """If True, enable KV cache events for tracking block storage and removal.
-    Events can be published externally by zmq using the event publisher config.
-    """
-
-    publisher: str = "null"
-    """The publisher to use for publishing kv events. Can be "null", "zmq".
-    """
-
-    endpoint: str = "tcp://*:5557"
-    """The zmq endpoint to use for publishing kv events.
-    """
-
-    replay_endpoint: Optional[str] = None
-    """The zmq endpoint to use for replaying kv events.
-    """
-
-    buffer_steps: int = 10_000
-    """The number of steps to cache for replay endpoint. Will only save
-    events from the last N steps for the replay endpoint.
-    """
-
-    hwm: int = 100_000
-    """The zmq high water mark for the event publisher. After queueing N events,
-    events will start dropping if the consumer is not keeping up.
-    """
-
-    max_queue_size: int = 100_000
-    """The maximum number of events to queue while waiting for publishing.
-    """
-
-    topic: str = ""
-    """The topic to use for the event publisher. Consumers can subscribe to
-    this topic to receive events.
-    """
-
-
 @config
 @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
 class VllmConfig:
@@ -3648,6 +3418,24 @@ class VllmConfig:
                 " Disabling `torch.compile`.")
             self.compilation_config.level = CompilationLevel.NO_COMPILATION
 
+        if self.cache_config.kv_sharing_fast_prefill:
+            if not envs.VLLM_USE_V1:
+                raise NotImplementedError(
+                    "Fast prefill optimization for KV sharing is not supported "
+                    "in V0 currently.")
+
+            if self.speculative_config is not None and \
+                self.speculative_config.use_eagle():
+                raise NotImplementedError(
+                    "Fast prefill optimization for KV sharing is not "
+                    "compatible with EAGLE as EAGLE requires correct logits "
+                    "for all tokens while fast prefill gives incorrect logits "
+                    "for prompt tokens.")
+
+            logger.warning_once(
+                "--kv-sharing-fast-prefill requires changes on model side for "
+                "correctness and to realize prefill savings. ")
+
         if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                 and self.compilation_config.level
                 != CompilationLevel.NO_COMPILATION):
@@ -3658,16 +3446,37 @@ class VllmConfig:
 
         disable_chunked_prefill_reasons: list[str] = []
 
-        if self.model_config and self.model_config.pooler_config:
-            pooling_type = self.model_config.pooler_config.pooling_type
-            if pooling_type is None or pooling_type.lower() != "last":
-                disable_chunked_prefill_reasons.append(
-                    "Only \"last\" pooling supports chunked "
-                    "prefill and prefix caching; disabling both.")
-            elif not getattr(self.model_config.hf_config, "is_causal", True):
+        if self.model_config:
+            if self.model_config.pooler_config:
+                pooling_type = self.model_config.pooler_config.pooling_type
+                if pooling_type is None or pooling_type.lower() != "last":
+                    disable_chunked_prefill_reasons.append(
+                        "Only \"last\" pooling supports chunked "
+                        "prefill and prefix caching; disabling both.")
+                if not getattr(self.model_config.hf_config, "is_causal", True):
+                    disable_chunked_prefill_reasons.append(
+                        "Only models using causal attention supports chunked "
+                        "prefill and prefix caching; disabling both.")
+            elif self.model_config.is_encoder_decoder:
+                self.scheduler_config.max_num_encoder_input_tokens = \
+                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
+                logger.debug(
+                    "Encoder-decoder model detected: setting "
+                    "`max_num_encoder_input_tokens` to encoder length (%s)",
+                    self.scheduler_config.max_num_encoder_input_tokens)
+                self.scheduler_config.disable_chunked_mm_input = True
                 disable_chunked_prefill_reasons.append(
-                    "Only models using causal attention supports chunked "
-                    "prefill and prefix caching; disabling both.")
+                    "Encoder-decoder models do not support chunked prefill nor"
+                    " prefix caching; disabling both.")
+                if (self.model_config.architecture
+                        == "WhisperForConditionalGeneration"
+                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
+                        != "spawn"):
+                    logger.warning(
+                        "Whisper is known to have issues with "
+                        "forked workers. If startup is hanging, "
+                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
+                        "to 'spawn'.")
 
         if disable_chunked_prefill_reasons:
             for reason in disable_chunked_prefill_reasons:
@@ -3724,7 +3533,7 @@ class VllmConfig:
             # logger should only print warning message for hybrid models. As we
             # can't know whether the model is hybrid or not now, so we don't log
             # warning message here and will log it later.
-            if not (current_platform.is_cuda() or current_platform.is_rocm()):
+            if not current_platform.support_hybrid_kv_cache():
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
                 self.scheduler_config.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
@@ -3774,30 +3583,40 @@ class VllmConfig:
 
     def _set_cudagraph_sizes(self):
         """
-        cudagraph batchsize padding logic:
-
-        `[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible
-        batch sizes that cudagraph will capture.
-
-        Depending on the engine's configuration of `max_num_seqs`, the
-        candidate batch sizes to capture cudagraph will shrink to the subset
-        which just cover the range of `[1, max_num_seqs]`. In the common case,
-        `max_num_seqs` is 256, and the cudagraph batch sizes will be
-        `[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`.
+        vLLM defines the default candidate list of batch sizes for CUDA graph
+        capture as:
 
-        However, if users specify the cudagraph capture sizes through
-        compilation config, we will use the specified sizes instead.
+        ```python
+        max_graph_size = min(max_num_seqs * 2, 512)
+        # 1, 2, 4, then multiples of 8 up to max_graph_size
+        cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
 
         In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
         will be the final sizes to capture cudagraph (in descending order).
 
-        During runtime, if batchsize is larger than
-        `vllm_config.compilation_config.cudagraph_capture_sizes`,
-        no cudagraph will be used.
-        If the batch size is no larger than
-        `vllm_config.compilation_config.cudagraph_capture_sizes`,
-        we can quickly find the padded graph size for a given batch size by
-        looking up `vllm_config.compilation_config.bs_to_padded_graph_size`.
+        These sizes are used to capture and reuse CUDA graphs for
+        performance-critical paths (e.g., decoding). Capturing enables
+        significantly faster kernel dispatch by avoiding Python overhead. The
+        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
+        most GPUs), which controls the total allowed number of tokens in a
+        batch. Since each sequence may have a variable number of tokens, the
+        maximum usable batch size will depend on actual sequence lengths.
+
+        Example:
+            With `max_num_batched_tokens = 8192`, and typical sequences
+            averaging ~32 tokens, most practical batch sizes fall below 256.
+            However, the system will still allow capture sizes up to 512 if
+            shape and memory permit.
+
+        Note:
+            If users explicitly specify cudagraph capture sizes in the
+            compilation config, those will override this default logic.
+            At runtime:
+
+            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
+            padded CUDA graph will be used.
+            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
+            not be used.
         """
 
         # calculate the default `batch_size_capture_list`
@@ -3899,7 +3718,6 @@ class VllmConfig:
             f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
             f"tokenizer_mode={self.model_config.tokenizer_mode}, "
             f"revision={self.model_config.revision}, "
-            f"override_neuron_config={self.model_config.override_neuron_config}, "  # noqa
             f"tokenizer_revision={self.model_config.tokenizer_revision}, "
             f"trust_remote_code={self.model_config.trust_remote_code}, "
             f"dtype={self.model_config.dtype}, "
@@ -3908,6 +3726,7 @@ class VllmConfig:
             f"load_format={self.load_config.load_format}, "
             f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
             f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
+            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
             f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
             f"quantization={self.model_config.quantization}, "
             f"enforce_eager={self.model_config.enforce_eager}, "
@@ -4006,7 +3825,7 @@ def contains_object_print(text):
     Check if the text looks like a printed Python object, e.g.
     contains any substring matching the pattern: "at 0xFFFFFFF>"
     We match against 0x followed by 2-16 hex chars (there's
-    a max of 16 on a 64 bit system).
+    a max of 16 on a 64-bit system).
 
     Args:
         text (str): The text to check
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 79761e784485979ee1e95410d09a653b38f62826..4c4e39c37ee5080f26bef3fdd268abd2357dd544 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -24,7 +24,7 @@ logger = init_logger(__name__)
 BlockSize = Literal[1, 8, 16, 32, 64, 128]
 CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
 MambaDType = Literal["auto", "float32"]
-PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"]
+PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
 
 
 @config
@@ -33,9 +33,8 @@ class CacheConfig:
     """Configuration for the KV cache."""
 
     block_size: SkipValidation[BlockSize] = None  # type: ignore
-    """Size of a contiguous cache block in number of tokens. This is ignored on
-    neuron devices and set to `--max-model-len`. On CUDA devices, only block
-    sizes up to 32 are supported. On HPU devices, block size defaults to 128.
+    """Size of a contiguous cache block in number of tokens. On CUDA devices,
+    only block sizes up to 32 are supported.
 
     This config has no static default. If left unspecified by the user, it will
     be set in `Platform.check_and_update_config()` based on the current
@@ -64,17 +63,12 @@ class CacheConfig:
     """Sliding window size for the KV cache. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
     enable_prefix_caching: Optional[bool] = None
-    """Whether to enable prefix caching. Disabled by default for V0. Enabled by
-    default for V1."""
-    prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin"
+    """Whether to enable prefix caching. Enabled by default for V1."""
+    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
-    - "builtin" is Python's built-in hash.\n
-    - "sha256" is collision resistant but with certain overheads.
-    This option uses Pickle for object serialization before hashing.\n
-    - "sha256_cbor_64bit" provides a reproducible, cross-language compatible
-    hash. It serializes objects using canonical CBOR and hashes them with
-    SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256
-    digest."""
+    - "sha256" uses Pickle for object serialization before hashing.\n
+    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
+    serializes objects using canonical CBOR and hashes them with SHA-256."""
     cpu_offload_gb: float = 0
     """The space in GiB to offload to CPU, per GPU. Default is 0, which means
     no offloading. Intuitively, this argument can be seen as a virtual way to
@@ -119,6 +113,15 @@ class CacheConfig:
     necessary for implementing this optimization in some models (e.g. Gemma3n)
     """
 
+    kv_cache_memory_bytes: Optional[int] = None
+    """Size of KV Cache per GPU in bytes. By default, this is set to None
+    and vllm can automatically infer the kv cache size based on
+    gpu_memory_utilization. However, users may want to manually specify
+    the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
+    control of how much memory gets used when compared with using
+    gpu_memory_memory_utilization. Note that kv_cache_memory_bytes
+    (when not-None) ignores gpu_memory_utilization"""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -145,19 +148,12 @@ class CacheConfig:
 
         self._verify_cache_dtype()
         self._verify_prefix_caching()
-        self._verify_kv_sharing_fast_prefill()
 
     def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
 
-    def _verify_kv_sharing_fast_prefill(self) -> None:
-        if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
-            raise NotImplementedError(
-                "Fast prefill optimization for KV sharing is not supported "
-                "in V0 currently.")
-
     @model_validator(mode='after')
     def _verify_args(self) -> Self:
         if self.cpu_offload_gb < 0:
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 5c3b220016360d460d947015c40d349712417cb9..f8ccc202226156188655c6042a94c6a3ce151f39 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -234,7 +234,7 @@ class CompilationConfig:
     - FULL_AND_PIECEWISE.
 
     PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
-    incompatiable ops (i.e. some attention ops) outside the cudagraph
+    incompatible ops (i.e. some attention ops) outside the cudagraph
     for general flexibility.
     This is the default mode.
 
@@ -340,6 +340,8 @@ class CompilationConfig:
         "vllm.mamba_mixer",
         "vllm.short_conv",
         "vllm.linear_attention",
+        "vllm.plamo2_mamba_mixer",
+        "vllm.gdn_attention",
     ]
 
     def compute_hash(self) -> str:
@@ -545,7 +547,8 @@ class CompilationConfig:
             # full cudagraph outside the fx graph. This reduces some cpu
             # overhead when the runtime batch_size is not cudagraph captured.
             # see https://github.com/vllm-project/vllm/pull/20059 for details.
-            self.splitting_ops = self._attention_ops
+            # make a copy to avoid mutating the class-level list via reference.
+            self.splitting_ops = list(self._attention_ops)
         elif len(self.splitting_ops) == 0:
             logger.warning_once("Using piecewise compilation with empty "
                                 "splitting_ops.")
@@ -560,6 +563,18 @@ class CompilationConfig:
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
 
+        if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
+            # exclude MoE dispatch/combine from capture by ensuring
+            # piecewise splitting includes them, so communication remains
+            # outside CUDA graphs while compute can still be graphed.
+            moe_ops = [
+                "vllm.moe_forward",
+                "vllm.moe_forward_shared",
+            ]
+            for op in moe_ops:
+                if op not in self.splitting_ops:
+                    self.splitting_ops.append(op)
+
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c6bdffa1281d877b201655b2a6c6f83c53e661e
--- /dev/null
+++ b/vllm/config/kv_events.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+
+@config
+@dataclass
+class KVEventsConfig:
+    """Configuration for KV event publishing."""
+
+    enable_kv_cache_events: bool = False
+    """If True, enable KV cache events for tracking block storage and removal.
+    Events can be published externally by zmq using the event publisher config.
+    """
+
+    publisher: str = "null"
+    """The publisher to use for publishing kv events. Can be "null", "zmq".
+    """
+
+    endpoint: str = "tcp://*:5557"
+    """The zmq endpoint to use for publishing kv events.
+    """
+
+    replay_endpoint: Optional[str] = None
+    """The zmq endpoint to use for replaying kv events.
+    """
+
+    buffer_steps: int = 10_000
+    """The number of steps to cache for replay endpoint. Will only save
+    events from the last N steps for the replay endpoint.
+    """
+
+    hwm: int = 100_000
+    """The zmq high water mark for the event publisher. After queueing N events,
+    events will start dropping if the consumer is not keeping up.
+    """
+
+    max_queue_size: int = 100_000
+    """The maximum number of events to queue while waiting for publishing.
+    """
+
+    topic: str = ""
+    """The topic to use for the event publisher. Consumers can subscribe to
+    this topic to receive events.
+    """
diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9abf4acacfe8185c749387e871758d29df4bb490
--- /dev/null
+++ b/vllm/config/kv_transfer.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+import uuid
+from dataclasses import field
+from typing import Any, Literal, Optional, get_args
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+KVProducer = Literal["kv_producer", "kv_both"]
+KVConsumer = Literal["kv_consumer", "kv_both"]
+KVRole = Literal[KVProducer, KVConsumer]
+
+
+@config
+@dataclass
+class KVTransferConfig:
+    """Configuration for distributed KV cache transfer."""
+
+    kv_connector: Optional[str] = None
+    """The KV connector for vLLM to transmit KV caches between vLLM instances.
+    """
+
+    engine_id: Optional[str] = None
+    """The engine id for KV transfers."""
+
+    kv_buffer_device: Optional[str] = "cuda"
+    """The device used by kv connector to buffer the KV cache.
+    Currently only support 'cuda'."""
+
+    kv_buffer_size: float = 1e9
+    """The buffer size for TorchDistributedConnector. Measured in number of
+    bytes. Recommended value: 1e9 (about 1GB)."""
+
+    kv_role: Optional[KVRole] = None
+    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
+    are 'kv_producer', 'kv_consumer', and 'kv_both'."""
+
+    kv_rank: Optional[int] = None
+    """The rank of this vLLM instance in the KV cache transfer. Typical value:
+    0 for prefill instance, 1 for decode instance.
+    Currently only 1P1D is supported."""
+
+    kv_parallel_size: int = 1
+    """The number of parallel instances for KV cache transfer. For
+    P2pNcclConnector, this should be 2."""
+
+    kv_ip: str = "127.0.0.1"
+    """The KV connector ip, used to build distributed connection."""
+
+    kv_port: int = 14579
+    """The KV connector port, used to build distributed connection."""
+
+    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
+    """any extra config that the connector may need."""
+
+    kv_connector_module_path: Optional[str] = None
+    """The Python module path to dynamically load the KV connector from.
+    Only supported in V1."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self) -> None:
+        if self.engine_id is None:
+            self.engine_id = str(uuid.uuid4())
+
+        if self.kv_role is not None and self.kv_role not in get_args(KVRole):
+            raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
+                             f"Supported roles are {get_args(KVRole)}")
+
+        if self.kv_connector is not None and self.kv_role is None:
+            raise ValueError("Please specify kv_disagg_role when kv_connector "
+                             f"is set, supported roles are {get_args(KVRole)}")
+
+    @property
+    def is_kv_transfer_instance(self) -> bool:
+        return self.kv_connector is not None and \
+            self.kv_role in get_args(KVRole)
+
+    @property
+    def is_kv_producer(self) -> bool:
+        return self.kv_connector is not None and \
+            self.kv_role in get_args(KVProducer)
+
+    @property
+    def is_kv_consumer(self) -> bool:
+        return self.kv_connector is not None and \
+            self.kv_role in get_args(KVConsumer)
+
+    def get_from_extra_config(self, key, default) -> Any:
+        return self.kv_connector_extra_config.get(key, default)
diff --git a/vllm/config/load.py b/vllm/config/load.py
new file mode 100644
index 0000000000000000000000000000000000000000..26ffec23ad5c67a121c469dcfa3689c8139f96af
--- /dev/null
+++ b/vllm/config/load.py
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from dataclasses import field
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    from vllm.model_executor.model_loader import LoadFormats
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
+else:
+    LoadFormats = Any
+    TensorizerConfig = Any
+
+logger = init_logger(__name__)
+
+
+@config
+@dataclass
+class LoadConfig:
+    """Configuration for loading the model weights."""
+
+    load_format: Union[str, LoadFormats] = "auto"
+    """The format of the model weights to load:\n
+    - "auto" will try to load the weights in the safetensors format and fall
+    back to the pytorch bin format if safetensors format is not available.\n
+    - "pt" will load the weights in the pytorch bin format.\n
+    - "safetensors" will load the weights in the safetensors format.\n
+    - "npcache" will load the weights in pytorch format and store a numpy cache
+    to speed up the loading.\n
+    - "dummy" will initialize the weights with random values, which is mainly
+    for profiling.\n
+    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
+    loading. See the Tensorize vLLM Model script in the Examples section for
+    more information.\n
+    - "runai_streamer" will load the Safetensors weights using Run:ai Model
+    Streamer.\n
+    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
+    - "sharded_state" will load weights from pre-sharded checkpoint files,
+    supporting efficient loading of tensor-parallel models.\n
+    - "gguf" will load weights from GGUF format files (details specified in
+    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
+    - "mistral" will load weights from consolidated safetensors files used by
+    Mistral models.
+    - Other custom values can be supported via plugins."""
+    download_dir: Optional[str] = None
+    """Directory to download and load the weights, default to the default
+    cache directory of Hugging Face."""
+    safetensors_load_strategy: str = "lazy"
+    """Specifies the loading strategy for safetensors weights.
+    - "lazy" (default): Weights are memory-mapped from the file. This enables
+      on-demand loading and is highly efficient for models on local storage.
+    - "eager": The entire file is read into CPU memory upfront before loading.
+      This is recommended for models on network filesystems (e.g., Lustre, NFS)
+      as it avoids inefficient random reads, significantly speeding up model
+      initialization. However, it uses more CPU RAM.
+    """
+    model_loader_extra_config: Union[dict, TensorizerConfig] = field(
+        default_factory=dict)
+    """Extra config for model loader. This will be passed to the model loader
+    corresponding to the chosen load_format."""
+    device: Optional[str] = None
+    """Device to which model weights will be loaded, default to
+    device_config.device"""
+    ignore_patterns: Optional[Union[list[str], str]] = None
+    """The list of patterns to ignore when loading the model. Default to
+    "original/**/*" to avoid repeated loading of llama's checkpoints."""
+    use_tqdm_on_load: bool = True
+    """Whether to enable tqdm for showing progress bar when loading model
+    weights."""
+    pt_load_map_location: Union[str, dict[str, str]] = "cpu"
+    """
+    pt_load_map_location: the map location for loading pytorch checkpoint, to
+    support loading checkpoints can only be loaded on certain devices like
+    "cuda", this is equivalent to {"": "cuda"}. Another supported format is
+    mapping from different devices like from GPU 1 to GPU 0:
+    {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
+    in dictionary needs to be double quoted for json parsing. For more details,
+    see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
+    """
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        self.load_format = self.load_format.lower()
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns)
+        else:
+            self.ignore_patterns = ["original/**/*"]
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe28f5dad4fa7cdcb3986f93d7add34012e02a1
--- /dev/null
+++ b/vllm/config/lora.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+
+import torch
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+import vllm.envs as envs
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.config.cache import CacheConfig
+else:
+    ModelConfig = Any
+    CacheConfig = Any
+
+logger = init_logger(__name__)
+
+LoRADType = Literal["auto", "float16", "bfloat16"]
+
+
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
+    """Configuration for LoRA."""
+
+    max_lora_rank: int = 16
+    """Max LoRA rank."""
+    max_loras: int = 1
+    """Max number of LoRAs in a single batch."""
+    fully_sharded_loras: bool = False
+    """By default, only half of the LoRA computation is sharded with tensor
+    parallelism. Enabling this will use the fully sharded layers. At high
+    sequence length, max rank or tensor parallel size, this is likely faster.
+    """
+    max_cpu_loras: Optional[int] = None
+    """Maximum number of LoRAs to store in CPU memory. Must be >= than
+    `max_loras`."""
+    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    """Data type for LoRA. If auto, will default to base model dtype."""
+    lora_extra_vocab_size: int = 256
+    """(Deprecated) Maximum size of extra vocabulary that can be present in a 
+    LoRA adapter. Will be removed in v0.12.0."""
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
+    default_mm_loras: Optional[dict[str, str]] = None
+    """Dictionary mapping specific modalities to LoRA model paths; this field
+    is only applicable to multimodal models and should be leveraged when a
+    model always expects a LoRA to be active when a given modality is present.
+    Note that currently, if a request provides multiple additional
+    modalities, each of which have their own LoRA, we do NOT apply
+    default_mm_loras because we currently only support one lora adapter
+    per prompt. When run in offline mode, the lora IDs for n modalities
+    will be automatically assigned to 1-n with the names of the modalities
+    in alphabetic order."""
+    bias_enabled: bool = False
+    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
+    removed in v0.12.0."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        factors: list[Any] = []
+        factors.append(self.max_lora_rank)
+        factors.append(self.max_loras)
+        factors.append(self.fully_sharded_loras)
+        factors.append(self.lora_dtype)
+        factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
+        factors.append(self.bias_enabled)
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        # Deprecation warning for lora_extra_vocab_size
+        logger.warning(
+            "`lora_extra_vocab_size` is deprecated and will be removed "
+            "in v0.12.0. Additional vocabulary support for "
+            "LoRA adapters is being phased out.")
+
+        # Deprecation warning for enable_lora_bias
+        if self.bias_enabled:
+            logger.warning("`enable_lora_bias` is deprecated "
+                           "and will be removed in v0.12.0.")
+
+        # Setting the maximum rank to 512 should be able to satisfy the vast
+        # majority of applications.
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
+        possible_lora_extra_vocab_size = (256, 512)
+        if self.max_lora_rank not in possible_max_ranks:
+            raise ValueError(
+                f"max_lora_rank ({self.max_lora_rank}) must be one of "
+                f"{possible_max_ranks}.")
+        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
+            raise ValueError(
+                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
+                f"must be one of {possible_lora_extra_vocab_size}.")
+        if self.max_loras < 1:
+            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+        if self.max_cpu_loras is None:
+            self.max_cpu_loras = self.max_loras
+        elif self.max_cpu_loras < self.max_loras:
+            raise ValueError(
+                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+                f"max_loras ({self.max_loras})")
+
+    def verify_with_cache_config(self, cache_config: CacheConfig):
+        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
+            raise ValueError(
+                "V0 LoRA does not support CPU offload, please use V1.")
+
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.lora_dtype in (None, "auto"):
+            self.lora_dtype = model_config.dtype
+        elif isinstance(self.lora_dtype, str):
+            self.lora_dtype = getattr(torch, self.lora_dtype)
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 9ea883d4a03cd1db5821a1d7de14e4d047b86b15..2f8ad5c6b6b04fce98608a02f6d5355044c4f3e3 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -87,7 +87,7 @@ class ParallelConfig:
     data_parallel_external_lb: bool = False
     """Whether to use "external" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
-    wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
+    wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
     is provided explicitly to vllm serve."""
     data_parallel_hybrid_lb: bool = False
     """Whether to use "hybrid" DP LB mode. Applies only to online serving
@@ -170,6 +170,11 @@ class ParallelConfig:
     Set to be private as it's not intended to be configured by users.
     """
 
+    decode_context_parallel_size: int = 1
+    """Number of decode context parallel groups, because the world size does
+    not change by dcp, it simply reuse the GPUs of TP group, and tp_size
+    needs to be divisible by dcp_size."""
+
     @property
     def world_size_across_dp(self) -> int:
         """world_size_across_dp is TPxPPxDP, it is the size of the world
@@ -363,8 +368,10 @@ class ParallelConfig:
         else:
             if self.eplb_config.num_redundant_experts != 0:
                 raise ValueError(
-                    "num_redundant_experts should be used with EPLB."
-                    f"{self.eplb_config.num_redundant_experts}.")
+                    "num_redundant_experts is set to "
+                    f"{self.eplb_config.num_redundant_experts} but EPLB is not "
+                    "enabled. Either enable EPLB or unset "
+                    "num_redundant_experts.")
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
@@ -372,10 +379,7 @@ class ParallelConfig:
             from vllm.executor import ray_utils
             backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
-            if current_platform.is_neuron():
-                # neuron uses single process to control multiple devices
-                backend = "uni"
-            elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
+            if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                 backend = "uni"
             elif (current_platform.is_cuda()
                   and cuda_device_count_stateless() < self.world_size):
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index d7864293e9647a7900aa30c4b5cd02f44b92f813..92ebad778ea4b200819ce1f9595f06891d974050 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 85f87cb21edcd04aaab49cc5f5a77de5637d6ffd..7c0f30b9aab8cc69e715f8fdb6267f87d5208e45 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -13,11 +13,6 @@ from .base_device_communicator import All2AllManagerBase, Cache
 
 logger = init_logger(__name__)
 
-if TYPE_CHECKING:
-    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
-else:
-    FusedMoE = None
-
 
 class NaiveAll2AllManager(All2AllManagerBase):
     """
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 5c64e7d5c4ba3f2faf6b0ca23cc60c6bd676105f..805a88854b77c37c052588c7f2a40f7a58492232 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -36,8 +36,8 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = {
     "10.0": {
         2: 2 * MiB,  # 2 MB
         4: 2 * MiB,  # 2 MB
-        6: 2 * MiB,  # 2 MB
-        8: 2 * MiB,  # 2 MB
+        6: 1 * MiB,  # 1 MB
+        8: 1 * MiB,  # 1 MB
     }
 }
 
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index 9131582eef7549559aa90fbd7fdedd57ade39a66..01f59b44a0e69611956824f97f69933f6ba8c8fa 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -252,7 +252,10 @@ class DeviceCommunicatorBase:
 
         moe_modules = [
             module for module in model.modules()
-            if module.__class__.__name__ == "FusedMoE"
+            # TODO(bnell): Should use isinstance but can't.  Maybe search for
+            # presence of quant_method.init_prepare_finalize?
+            if (module.__class__.__name__ == "FusedMoE"
+                or module.__class__.__name__ == "SharedFusedMoE")
         ]
         for module in moe_modules:
             module.quant_method.init_prepare_finalize(module)
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index eef3f9f75f9f1838848765e4b493a2c10fb37798..78c90b006ffc8b202e196c2daff4e7d8b3ec34d8 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -57,11 +57,19 @@ class CudaCommunicator(DeviceCommunicatorBase):
         self.ca_comm: Optional[CustomAllreduce] = None
         self.qr_comm: Optional[QuickAllReduce] = None
         self.symm_mem_comm: Optional[SymmMemCommunicator] = None
+        if envs.VLLM_ALLREDUCE_USE_SYMM_MEM and current_platform.is_cuda():
+            self.symm_mem_comm = SymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
         if use_custom_allreduce and self.world_size > 1:
             # Initialize a custom fast all-reduce implementation.
             self.ca_comm = CustomAllreduce(
                 group=self.cpu_group,
                 device=self.device,
+                symm_mem_enabled=(self.symm_mem_comm is not None
+                                  and not self.symm_mem_comm.disabled),
             )
 
             if current_platform.is_rocm():
@@ -72,11 +80,6 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 # currently be an MI300 series.
                 self.qr_comm = QuickAllReduce(group=self.cpu_group,
                                               device=self.device)
-        if envs.VLLM_ALLREDUCE_USE_SYMM_MEM and current_platform.is_cuda():
-            self.symm_mem_comm = SymmMemCommunicator(
-                group=self.cpu_group,
-                device=self.device,
-            )
 
         if self.use_all2all:
             all2all_backend = envs.VLLM_ALL2ALL_BACKEND
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 6d8f865861e2f2587c9f965dee57677e7b92f2ad..d6e5218dcb6779ace7da444e27bdbd66ff17dce4 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -54,13 +54,14 @@ class CustomAllreduce:
     def __init__(self,
                  group: ProcessGroup,
                  device: Union[int, str, torch.device],
-                 max_size=8192 * 1024) -> None:
+                 max_size=8192 * 1024,
+                 symm_mem_enabled=False) -> None:
         """
         Args:
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the CustomAllreduce to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
         It is the caller's responsibility to make sure each communicator
         is bind to a unique device, and all communicators in this group
         are in the same node.
@@ -111,7 +112,7 @@ class CustomAllreduce:
         self.device = device
         device_capability = current_platform.get_device_capability(
         ).as_version_str()
-        if (current_platform.is_cuda() and envs.VLLM_ALLREDUCE_USE_SYMM_MEM
+        if (current_platform.is_cuda() and symm_mem_enabled
                 and device_capability in CUSTOM_ALL_REDUCE_MAX_SIZES):
             max_size = min(
                 CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability][world_size],
@@ -159,7 +160,7 @@ class CustomAllreduce:
 
         self.disabled = False
         # Buffers memory are owned by this Python class and passed to C++.
-        # Meta data composes of two parts: meta data for synchronization and a
+        # Metadata composes of two parts: metadata for synchronization and a
         # temporary buffer for storing intermediate allreduce results.
         self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
                                                    group=group,
diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py
deleted file mode 100644
index 5b61a1687a0168390af6bb62b1173fc24fd0f1b7..0000000000000000000000000000000000000000
--- a/vllm/distributed/device_communicators/neuron_communicator.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-
-from vllm.distributed.device_communicators.base_device_communicator import (
-    DeviceCommunicatorBase)
-from vllm.platforms import current_platform
-
-if current_platform.is_neuron():
-    import torch_xla.core.xla_model as xm
-
-
-class NeuronCommunicator(DeviceCommunicatorBase):
-
-    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
-        return xm.all_reduce(xm.REDUCE_SUM, x)
-
-    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        assert dim == -1, "Neuron only supports dim=-1 for all-gather."
-        return xm.all_gather(x, dim=dim)
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 502bfd39005add9f429dbb925f60e26d6053bcfc..3e4d0d250af94dbe32d27fb5780b482374e6cbbf 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -31,7 +31,7 @@ class PyNcclCommunicator:
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the PyNcclCommunicator to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
             library_path: the path to the NCCL library. If None, it will
                 use the default library path.
         It is the caller's responsibility to make sure each communicator
diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py
index c61231e2d33f4a2abc0a370a3821b1e7dfb8b13e..836241910e2fb7f4b4b1a459376147cb873390dc 100644
--- a/vllm/distributed/device_communicators/quick_all_reduce.py
+++ b/vllm/distributed/device_communicators/quick_all_reduce.py
@@ -78,7 +78,7 @@ class QuickAllReduce:
             group: the process group to work on. If None, it will use the
                 default process group.
             device: the device to bind the CustomAllreduce to. If None,
-                it will be bind to f"cuda:{local_rank}".
+                it will be bound to f"cuda:{local_rank}".
         It is the caller's responsibility to make sure each communicator
         is bind to a unique device, and all communicators in this group
         are in the same node.
diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py
index 46cc1c2f52d67e3990bdb4cf4d9ed0c4cf2251aa..8cd8c459a9e5184e436301e0fea9ff6526795e9c 100644
--- a/vllm/distributed/device_communicators/ray_communicator.py
+++ b/vllm/distributed/device_communicators/ray_communicator.py
@@ -186,7 +186,7 @@ class RayPPCommunicator(Communicator):
         """
         Receive a torch.Tensor from a peer and synchronize the current stream.
 
-        After this call returns, the receive buffer is safe to read from from
+        After this call returns, the receive buffer is safe to read from
         any stream. An RayChannelError will be raised if an error occurred
         (e.g., remote actor died), and the buffer is not safe to read.
 
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index d907e1b833d040d1bf9e5d7a7452d8eeeab29487..09012d16978d93773d901ed01c335e8c73cde460 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -27,8 +27,13 @@ class SymmMemCommunicator:
         "10.0": [6, 8],
     }
 
-    def __init__(self, group: ProcessGroup, device: Union[int, str,
-                                                          torch.device]):
+    def __init__(
+            self,
+            group: ProcessGroup,
+            device: Union[int, str, torch.device],
+            # add options for testing
+            force_multimem: Optional[bool] = None,
+            max_size_override: Optional[int] = None):
         self.disabled = True
 
         if not symm_mem_available:
@@ -64,8 +69,17 @@ class SymmMemCommunicator:
                 self.world_size,
             )
             return
-        self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
-            self.world_size]
+        # Use override max_size if provided, otherwise use default
+        if max_size_override is not None:
+            self.max_size = max_size_override
+            logger.info(
+                "SymmMemCommunicator: Using override max_size: %s bytes",
+                self.max_size,
+            )
+        else:
+            self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[
+                self.device_capability][self.world_size]
+
         self.buffer = torch_symm_mem.empty(
             self.max_size // self.dtype.itemsize,
             device=self.device,
@@ -76,6 +90,7 @@ class SymmMemCommunicator:
             logger.warning("SymmMemCommunicator: symmetric memory "
                            "multicast operations are not supported.")
             return
+        self.force_multimem = force_multimem
         self.disabled = False
 
     def should_use_symm_mem(self, inp: torch.Tensor):
@@ -98,8 +113,18 @@ class SymmMemCommunicator:
         if out is None:
             out = torch.empty_like(inp)
         self.buffer[:inp.numel()].copy_(inp.view(-1))
-        if self.world_size in self._WORLD_SIZES_MULTIMEM[
-                self.device_capability]:
+
+        # Determine which algorithm to use
+        use_multimem = False
+        if self.force_multimem is not None:
+            # Test override: use forced setting
+            use_multimem = self.force_multimem
+        else:
+            # Normal logic: use multimem for supported world sizes
+            use_multimem = self.world_size in self._WORLD_SIZES_MULTIMEM[
+                self.device_capability]
+
+        if use_multimem:
             torch.ops.symm_mem.multimem_all_reduce_(self.buffer[:inp.numel()],
                                                     "sum",
                                                     self.group.group_name)
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index 37f8f72fa90569da143931e78c2da5c62271f28a..46f0cd9289b2305116f9d9961ac8a943e93c627c 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -14,8 +14,9 @@ from typing import Any, Callable, Optional, Union
 import msgspec
 import zmq
 
-from vllm.config import KVEventsConfig
+from vllm.config.kv_events import KVEventsConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import ExternalBlockHash
 
 logger = init_logger(__name__)
 
@@ -44,8 +45,8 @@ MEDIUM_GPU = "GPU"
 
 
 class BlockStored(KVCacheEvent):
-    block_hashes: list[int]
-    parent_block_hash: Optional[int]
+    block_hashes: list[ExternalBlockHash]
+    parent_block_hash: Optional[ExternalBlockHash]
     token_ids: list[int]
     block_size: int
     lora_id: Optional[int]
@@ -53,7 +54,7 @@ class BlockStored(KVCacheEvent):
 
 
 class BlockRemoved(KVCacheEvent):
-    block_hashes: list[int]
+    block_hashes: list[ExternalBlockHash]
     medium: Optional[str]
 
 
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
index fa9b7e4f14c020ccd6e12863173b6e81b318837d..cf58e7914972c7b2bc3a3a83b262554bdf2b9191 100644
--- a/vllm/distributed/kv_transfer/__init__.py
+++ b/vllm/distributed/kv_transfer/__init__.py
@@ -2,11 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.distributed.kv_transfer.kv_transfer_state import (
-    KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group,
-    has_kv_transfer_group, is_v1_kv_transfer_group)
+    KVConnectorBaseType, ensure_kv_transfer_initialized,
+    ensure_kv_transfer_shutdown, get_kv_transfer_group, has_kv_transfer_group,
+    is_v1_kv_transfer_group)
 
 __all__ = [
     "get_kv_transfer_group", "has_kv_transfer_group",
     "is_v1_kv_transfer_group", "ensure_kv_transfer_initialized",
-    "KVConnectorBaseType"
+    "ensure_kv_transfer_shutdown", "KVConnectorBaseType"
 ]
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 584fc1d6559514b70d63de6697707f71e4bb071e..670f9c26b2104f8b89a13a17799bee98a169c1c6 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -14,7 +14,8 @@ from vllm.logger import init_logger
 # yapf: enable
 
 if TYPE_CHECKING:
-    from vllm.config import KVTransferConfig, VllmConfig
+    from vllm.config import VllmConfig
+    from vllm.config.kv_transfer import KVTransferConfig
 
 logger = init_logger(__name__)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 2364400b3d350d2b1cf192ebf45afb6b644c572c..f4dc248a127942c156e263e78530b406ff19603a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -6,7 +6,7 @@ KV cache helper for store.
 from collections import defaultdict
 from collections.abc import Sequence
 from concurrent.futures import CancelledError, Future
-from typing import Optional, cast
+from typing import Literal, Optional, Union, cast
 
 import torch
 
@@ -196,3 +196,51 @@ class KVOutputAggregator:
             output_future.add_done_callback(make_callback(i))
 
         return result_future
+
+
+def _make_src_and_dst_indices(
+    src_block_ids: list[int],
+    dst_block_ids: list[int],
+    src_device: Union[torch.device, str],
+    dst_device: Union[torch.device, str],
+) -> tuple[torch.Tensor, torch.Tensor]:
+    src_indices = torch.tensor(src_block_ids,
+                               device=src_device,
+                               dtype=torch.int64)
+    dst_indices = torch.tensor(dst_block_ids,
+                               device=dst_device,
+                               dtype=torch.int64)
+    return src_indices, dst_indices
+
+
+def copy_kv_blocks(
+    src_kv_caches: dict[str, torch.Tensor],
+    dst_kv_caches: dict[str, torch.Tensor],
+    src_block_ids: list[int],
+    dst_block_ids: list[int],
+    direction: Literal["h2d", "d2h"],
+) -> None:
+    """Copy kv blocks between different buffers."""
+    if not src_kv_caches or not dst_kv_caches or \
+       not src_block_ids or not dst_block_ids or \
+       len(src_block_ids) != len(dst_block_ids):
+        return
+
+    src_device = next(iter(src_kv_caches.values())).device
+    dst_device = next(iter(dst_kv_caches.values())).device
+
+    src_indices, dst_indices = _make_src_and_dst_indices(
+        src_block_ids=src_block_ids,
+        dst_block_ids=dst_block_ids,
+        src_device=src_device,
+        dst_device=dst_device)
+
+    from vllm.platforms import current_platform
+    if direction == "h2d":
+        copy_fn = current_platform.insert_blocks_to_device
+    else:
+        copy_fn = current_platform.swap_out_blocks_to_host
+    for layer_name in src_kv_caches:
+        src_tensor = src_kv_caches[layer_name]
+        dst_tensor = dst_kv_caches[layer_name]
+        copy_fn(src_tensor, dst_tensor, src_indices, dst_indices)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 2804003f5a7085b7fcb843a602655e7c03d78ea9..cd4561154b78b3866b1b667549c33d1d2b61b451 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -226,6 +226,14 @@ class KVConnectorBase_V1(ABC):
         """
         return None, None
 
+    def shutdown(self):
+        """
+        Shutdown the connector. This is called when the worker process
+        is shutting down to ensure that all the async operations are
+        completed and the connector is cleaned up properly.
+        """
+        return None
+
     # ==============================
     # Scheduler-side methods
     # ==============================
@@ -235,7 +243,7 @@ class KVConnectorBase_V1(ABC):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
@@ -247,8 +255,11 @@ class KVConnectorBase_V1(ABC):
 
         Returns:
             A tuple with the following elements:
-                - The number of tokens that can be loaded from the 
-                  external KV cache beyond what is already computed.
+                - An optional number of tokens that can be loaded from the 
+                  external KV cache beyond what is already computed. 
+                  If None, it means that the connector needs more time to
+                  determine the number of matched tokens, and the scheduler
+                  should query for this request again later.
                 - `True` if external KV cache tokens will be loaded
                   asynchronously (between scheduler steps). Must be
                   'False' if the first element is 0.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index e838ac2499c04ad5332d3fd7fa4c968d09fbaf89..c99f538ee418541506416bdfad9fb73049a5ba49 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -110,7 +110,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index 65bcb4d93b1e13b443ce862e46ae9c4fb504c504..616d158d6767079d0bb138da326e3de5862aefb5 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -7,7 +7,8 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
-from vllm.config import KVTransferConfig, VllmConfig
+from vllm.config import VllmConfig
+from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.distributed.kv_transfer.kv_connector.factory import (
     KVConnectorFactory)
@@ -87,6 +88,18 @@ class MultiConnector(KVConnectorBase_V1):
         for c in self._connectors:
             c.clear_connector_metadata()
 
+    def shutdown(self):
+        exception: Optional[Exception] = None
+        for c in self._connectors:
+            try:
+                c.shutdown()
+            except Exception as e:
+                logger.exception("Exception during connector %s shutdown.",
+                                 c.__class__.__name__)
+                exception = e
+        if exception:
+            raise exception
+
     # ==============================
     # Worker-side methods
     # ==============================
@@ -142,11 +155,15 @@ class MultiConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
         to_return = (0, False)
         for i, c in enumerate(self._connectors):
             toks, load_async = c.get_num_new_matched_tokens(
                 request, num_computed_tokens)
+            # If there is a connector still looking up the matches,
+            # we return None to indicate that we are not done yet.
+            if toks is None:
+                return (None, False)
             # The first connector that has new matched tokens will be assigned
             # to this request.
             if to_return[0] == 0 and toks > 0:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 6608d2a4a9e098766901a131d9c0e6afecaf81cd..17f5be76ce400c611f14676edf55c0f25e8f414a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
 
 import msgspec
+import numpy as np
 import torch
 import zmq
 
@@ -60,6 +61,7 @@ except ImportError:
 _NIXL_SUPPORTED_XPUS = {
     "cuda": ("cuda", ),
     "tpu": ("cpu", ),
+    "xpu": ("cpu", ),
 }
 
 
@@ -160,7 +162,7 @@ class NixlConnector(KVConnectorBase_V1):
 
     def get_num_new_matched_tokens(
             self, request: "Request",
-            num_computed_tokens: int) -> tuple[int, bool]:
+            num_computed_tokens: int) -> tuple[Optional[int], bool]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.get_num_new_matched_tokens(
             request, num_computed_tokens)
@@ -715,7 +717,7 @@ class NixlConnectorWorker:
         # are non-contiguous (it's not locally guaranteed that they will be)
         # Disadvantage is that the encoded NixlAgentMetadata is now larger
         # (roughly 8KB vs 5KB).
-        # Conversely for FlashInfer, K and V are transferred in the same tensor
+        # Conversely for FlashInfer, K and V are registered in the same region
         # to better exploit the memory layout (ie num_blocks is the first dim).
         split_k_and_v = not (self.use_mla or self._use_pallas_v1
                              or self._use_flashinfer)
@@ -758,12 +760,21 @@ class NixlConnectorWorker:
         assert tensor_size_bytes % self.num_blocks == 0
         self.block_len = tensor_size_bytes // self.num_blocks
         self.slot_size_bytes = self.block_len // self.block_size
+        self.device_kv_caches = kv_caches
+        self.dst_num_blocks[self.engine_id] = self.num_blocks
         if self._use_flashinfer:
             assert self.slot_size_bytes % 2 == 0
             self.slot_size_bytes /= 2
-        self.device_kv_caches = kv_caches
-        self.dst_num_blocks[self.engine_id] = self.num_blocks
 
+            # NOTE (NickLucche) When FlashInfer is used, memory is registered
+            # with joint KV for each block. This minimizes the overhead in
+            # registerMem allowing faster descs queries. In order to be able to
+            # split on kv_heads dim as required by heterogeneous TP, one must
+            # be able to index K/V separately. Hence we double the number
+            # of 'virtual' regions here and halve `block_len` below.
+            self.num_regions *= 2
+
+        kv_block_len = self.get_backend_aware_kv_block_len()
         # Register local/src descr for NIXL xfer.
         blocks_data = []
         for base_addr in seen_base_addresses:
@@ -776,8 +787,18 @@ class NixlConnectorWorker:
                 block_offset = block_id * self.block_len
                 addr = base_addr + block_offset
                 # (addr, len, device id)
-                # TODO: does device_id matter to DRAM?
-                blocks_data.append((addr, self.block_len, self.tp_rank))
+                blocks_data.append((addr, kv_block_len, self.tp_rank))
+
+            if self._use_flashinfer:
+                # Separate and interleave K/V regions to maintain the same
+                # descs ordering. This is needed for selecting contiguous heads
+                # when split across TP ranks.
+                for block_id in range(self.num_blocks):
+                    block_offset = block_id * self.block_len
+                    addr = base_addr + block_offset
+                    # Register addresses for V cache (K registered first).
+                    v_addr = addr + kv_block_len
+                    blocks_data.append((v_addr, kv_block_len, self.tp_rank))
         logger.debug("Created %s blocks for src engine %s and rank %s",
                      len(blocks_data), self.engine_id, self.tp_rank)
 
@@ -787,7 +808,7 @@ class NixlConnectorWorker:
         self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
             "NIXL_INIT_AGENT", descs)
 
-        # TODO(mgoin): Hybrid memory allocator is currently diabled for
+        # TODO(mgoin): Hybrid memory allocator is currently disabled for
         # models with local attention (Llama 4). Can remove this once enabled.
         if self.vllm_config.model_config.hf_config.model_type == "llama4":
             from transformers import Llama4TextConfig
@@ -903,7 +924,7 @@ class NixlConnectorWorker:
             remote_block_size = nixl_agent_meta.block_len // (
                 self.slot_size_bytes * tp_ratio)
             if self._use_flashinfer:
-                # Account for joint KV in FlashInfer.
+                # With flashinfer, KV are sent in the same message.
                 remote_block_size //= 2
             if tp_ratio > 1:
                 # Heterogeneous TP expects same kv_cache_layout.
@@ -929,10 +950,10 @@ class NixlConnectorWorker:
         # rank. With heterogeneous TP, prepare the descriptors by splitting the
         # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
         # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
-        # Only register the remote's descriptors if current rank pulls from it.
         self.kv_caches_base_addr[
             engine_id] = nixl_agent_meta.kv_caches_base_addr
-        rank_offset = self.tp_rank % tp_ratio * self.block_len \
+        kv_block_len = self.get_backend_aware_kv_block_len()
+        rank_offset = self.tp_rank % tp_ratio * kv_block_len \
             if not (self.use_mla or is_kv_replicated) else 0
         # Register all remote blocks, but only the corresponding kv heads.
         for base_addr in nixl_agent_meta.kv_caches_base_addr:
@@ -943,7 +964,16 @@ class NixlConnectorWorker:
                 # self.block_len == remote_block_len//tp_ratio bytes.
                 addr = base_addr + block_offset + rank_offset
                 # (addr, len, device id)
-                blocks_data.append((addr, self.block_len, remote_tp_rank))
+                blocks_data.append((addr, kv_block_len, remote_tp_rank))
+
+            if self._use_flashinfer:
+                # With FlashInfer index V separately to allow head splitting.
+                for block_id in range(nixl_agent_meta.num_blocks):
+                    block_offset = block_id * nixl_agent_meta.block_len
+                    addr = base_addr + block_offset + rank_offset
+                    v_addr = addr + nixl_agent_meta.block_len // 2
+                    blocks_data.append((v_addr, kv_block_len, remote_tp_rank))
+
         logger.debug(
             "Created %s blocks for dst engine %s with remote rank %s and "
             "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
@@ -1163,8 +1193,8 @@ class NixlConnectorWorker:
         # workers will issue xfers to parts of the P worker remote kv caches.
 
         # Get descs ids.
-        local_block_descs_ids: list[int] = []
-        remote_block_descs_ids: list[int] = []
+        local_block_descs_ids: np.ndarray
+        remote_block_descs_ids: np.ndarray
         if not self.block_window_per_layer:
             # Default case: assume global attention
             remote_block_descs_ids = self._get_block_descs_ids(
@@ -1174,6 +1204,8 @@ class NixlConnectorWorker:
         else:
             # TODO(mgoin): remove this once we have hybrid memory allocator
             # Optimization for models with local attention (Llama 4)
+            local_descs_list = []
+            remote_descs_list = []
             for layer_idx, block_window in enumerate(
                     self.block_window_per_layer):
                 # For each layer:
@@ -1193,8 +1225,11 @@ class NixlConnectorWorker:
                 layer_remote_desc_ids = self._get_block_descs_ids(
                     dst_engine_id, layer_remote_block_ids, layer_idx)
 
-                local_block_descs_ids.extend(layer_local_desc_ids)
-                remote_block_descs_ids.extend(layer_remote_desc_ids)
+                local_descs_list.append(layer_local_desc_ids)
+                remote_descs_list.append(layer_remote_desc_ids)
+
+            local_block_descs_ids = np.concatenate(local_descs_list)
+            remote_block_descs_ids = np.concatenate(remote_descs_list)
 
         assert len(local_block_descs_ids) == len(remote_block_descs_ids)
 
@@ -1219,14 +1254,14 @@ class NixlConnectorWorker:
     def _get_block_descs_ids(self,
                              engine_id: str,
                              block_ids: list[int],
-                             layer_idx: Optional[int] = None) -> list[int]:
+                             layer_idx: Optional[int] = None) -> np.ndarray:
         """
         Get the descs ids for a set of block ids.
         If layer_idx is provided, we use the region_ids for the given layer.
         Otherwise, we use all regions.
         """
         if layer_idx is None:
-            region_ids = range(self.num_regions)
+            region_ids = np.arange(self.num_regions)
         else:
             assert layer_idx < self.num_layers
             if self.num_layers < self.num_regions:
@@ -1234,20 +1269,35 @@ class NixlConnectorWorker:
                 # the regions are organized as [K0, V0, K1, V1, ...]
                 # and we select K_i and V_i
                 assert 2 * self.num_layers == self.num_regions
-                region_ids = range(2 * layer_idx, 2 * layer_idx + 2)
+                region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2)
             else:
                 # Otherwise, we assume we have MLA and select i-th layer
                 assert self.num_layers == self.num_regions
-                region_ids = range(layer_idx, layer_idx + 1)
+                region_ids = np.arange(layer_idx, layer_idx + 1)
 
         num_blocks = self.dst_num_blocks[engine_id]
 
         # Compute the desc ids for each block.
-        descs_ids: list[int] = []
-        for reg_id in region_ids:
-            for block_id in block_ids:
-                descs_ids.append(reg_id * num_blocks + block_id)
-        return descs_ids
+        region_ids = region_ids[:, None]
+        block_ids = np.array(block_ids)[None, :]
+        descs_ids = region_ids * num_blocks + block_ids
+        return descs_ids.flatten()
+
+    def get_backend_aware_kv_block_len(self):
+        """
+        Get the block length for one K/V element (K and V have the same size).
+
+        For FA and other backends, this is equal to the length of the whole 
+        block, as K and V are in separate regions.
+        For FlashInfer, this is half the length of the whole block, as K and V
+        share the same region.
+        """
+        if self._use_flashinfer:
+            # For indexing only half (either just the K or V part).
+            block_len = self.block_len // 2
+        else:
+            block_len = self.block_len
+        return block_len
 
 
 @contextlib.contextmanager
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
index dfd95548c4632a3f0178bb5bdeb3c0a0871e73a5..fa7cc66ab654d484762a5a49971b39b29df5c31d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
@@ -15,7 +15,7 @@ import msgpack
 import torch
 import zmq
 
-from vllm.config import KVTransferConfig
+from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.device_communicators.pynccl_wrapper import (
     NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum)
 from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import (  # noqa: E501
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index fd79387269d56a7f013eeeff3af85054f74748b5..9f4da613f8b62c11a40a1a1f0817f0c94d5dbe4d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import safetensors
 import torch
@@ -238,7 +238,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 0b560d1b3b3ce3a5df94391c450f19b1cb405fda..2a434e280179ea1f4e571d52036d91098156f259 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -13,7 +13,7 @@ import zmq
 from safetensors.torch import load as safetensors_load
 from safetensors.torch import save as safetensors_save
 
-from vllm.config import KVTransferConfig
+from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
 from vllm.logger import init_logger
 from vllm.utils import join_host_port, make_zmq_path, split_host_port
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 09de0b682efca795d0b8ec874c13a5200c3ad043..66120e9a0a1a0a37bcfb3e2a8f092393e0752ddd 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -20,7 +20,7 @@ from typing import Callable, Optional
 
 import torch
 
-from vllm.config import KVTransferConfig
+from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase
 from vllm.distributed.utils import StatelessProcessGroup
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index 5e0f64fca220cb2ae246e25381040bedc5f8039a..d5747bed92771d0487bf31c5f949e2beae03a0b2 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -64,3 +64,10 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
                 config=vllm_config, role=KVConnectorRole.WORKER)
         else:
             raise ValueError("V0 is no longer supported")
+
+
+def ensure_kv_transfer_shutdown() -> None:
+    global _KV_CONNECTOR_AGENT
+    if _KV_CONNECTOR_AGENT is not None:
+        _KV_CONNECTOR_AGENT.shutdown()
+        _KV_CONNECTOR_AGENT = None
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index b89aee99c8d469786539152ea852b212442e0994..ef229299b684885c615f1e098d9b6d92b5cc288e 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -29,6 +29,7 @@ import weakref
 from collections import namedtuple
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
+from datetime import timedelta
 from multiprocessing import shared_memory
 from typing import Any, Callable, Optional, Union
 from unittest.mock import patch
@@ -904,6 +905,18 @@ def get_tensor_model_parallel_group():
     return get_tp_group()
 
 
+_DCP: Optional[GroupCoordinator] = None
+
+
+def get_dcp_group() -> GroupCoordinator:
+    assert _DCP is not None, (
+        "decode context model parallel group is not initialized")
+    return _DCP
+
+
+# kept for backward compatibility
+get_context_model_parallel_group = get_dcp_group
+
 _PP: Optional[GroupCoordinator] = None
 
 _DP: Optional[GroupCoordinator] = None
@@ -939,8 +952,8 @@ def get_pipeline_model_parallel_group():
 def graph_capture(device: torch.device):
     """
     `graph_capture` is a context manager which should surround the code that
-    is capturing the CUDA graph. Its main purpose is to ensure that the
-    some operations will be run after the graph is captured, before the graph
+    is capturing the CUDA graph. Its main purpose is to ensure that some
+    operations will be run after the graph is captured, before the graph
     is replayed. It returns a `GraphCaptureContext` object which contains the
     necessary data for the graph capture. Currently, it only contains the
     stream that the graph capture is running on. This stream is set to the
@@ -966,13 +979,12 @@ def set_custom_all_reduce(enable: bool):
     _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
-def init_distributed_environment(
-    world_size: int = -1,
-    rank: int = -1,
-    distributed_init_method: str = "env://",
-    local_rank: int = -1,
-    backend: str = "nccl",
-):
+def init_distributed_environment(world_size: int = -1,
+                                 rank: int = -1,
+                                 distributed_init_method: str = "env://",
+                                 local_rank: int = -1,
+                                 backend: str = "nccl",
+                                 timeout: Optional[timedelta] = None):
     logger.debug(
         "world_size=%d rank=%d local_rank=%d "
         "distributed_init_method=%s backend=%s", world_size, rank, local_rank,
@@ -1008,7 +1020,8 @@ def init_distributed_environment(
             backend=backend,
             init_method=distributed_init_method,
             world_size=world_size,
-            rank=rank)
+            rank=rank,
+            timeout=timeout)
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
@@ -1034,6 +1047,7 @@ def init_distributed_environment(
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
+    decode_context_model_parallel_size: Optional[int] = 1,
     backend: Optional[str] = None,
 ) -> None:
     """
@@ -1098,6 +1112,23 @@ def initialize_model_parallel(
                                     use_message_queue_broadcaster=True,
                                     group_name="tp")
 
+    # Build the DCP model-parallel groups.
+    global _DCP
+    assert _DCP is None, (
+        "decode context model parallel group is already initialized")
+    # Note(hc): In the current implementation of decode context parallel,
+    # dcp_size must not exceed tp_size, because the world size does not
+    # change by DCP, it simply reuses the GPUs of TP group, and split one
+    # TP group into tp_size//dcp_size DCP groups.
+    group_ranks = all_ranks.reshape(
+        -1, decode_context_model_parallel_size).unbind(0)
+    group_ranks = [x.tolist() for x in group_ranks]
+    _DCP = init_model_parallel_group(group_ranks,
+                                     get_world_group().local_rank,
+                                     backend,
+                                     use_message_queue_broadcaster=True,
+                                     group_name="dcp")
+
     # Build the pipeline model-parallel groups.
     global _PP
     assert _PP is None, (
@@ -1141,6 +1172,7 @@ def initialize_model_parallel(
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     pipeline_model_parallel_size: int,
+    decode_context_model_parallel_size: Optional[int] = 1,
     backend: Optional[str] = None,
 ) -> None:
     """Helper to initialize model parallel groups if they are not initialized,
@@ -1151,7 +1183,8 @@ def ensure_model_parallel_initialized(
         get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(tensor_model_parallel_size,
-                                  pipeline_model_parallel_size, backend)
+                                  pipeline_model_parallel_size,
+                                  decode_context_model_parallel_size, backend)
         return
 
     assert (
@@ -1226,6 +1259,16 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group
 
 
+def get_decode_context_model_parallel_world_size():
+    """Return world size for the decode context model parallel group."""
+    return get_dcp_group().world_size
+
+
+def get_decode_context_model_parallel_rank():
+    """Return my rank for the decode context model parallel group."""
+    return get_dcp_group().rank_in_group
+
+
 def get_node_count() -> int:
     """Return the total number of nodes in the distributed environment. """
     assert _NODE_COUNT is not None, (
@@ -1246,6 +1289,11 @@ def destroy_model_parallel():
         _PP.destroy()
     _PP = None
 
+    global _DCP
+    if _DCP:
+        _DCP.destroy()
+    _DCP = None
+
     global _DP
     if _DP:
         _DP.destroy()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e1d51febe60f535ee82166751b437fc30e712da1..a70ade69f05666d9f30130e2229b0fc32d0e893e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -22,9 +22,9 @@ from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
-                         ConfigFormat, ConfigType, ConvertOption,
-                         DecodingConfig, DetailedTraceModules, Device,
-                         DeviceConfig, DistributedExecutorBackend, EPLBConfig,
+                         ConfigType, ConvertOption, DecodingConfig,
+                         DetailedTraceModules, Device, DeviceConfig,
+                         DistributedExecutorBackend, EPLBConfig,
                          GuidedDecodingBackend, HfOverrides, KVEventsConfig,
                          KVTransferConfig, LoadConfig, LogprobsMode,
                          LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
@@ -227,8 +227,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int
             # Special case for large integers
-            if name in {"max_model_len", "max_num_batched_tokens"}:
+            human_readable_ints = {
+                "max_model_len",
+                "max_num_batched_tokens",
+                "kv_cache_memory_bytes",
+            }
+            if name in human_readable_ints:
                 kwargs[name]["type"] = human_readable_int
+                kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
         elif (contains_type(type_hints, dict)
@@ -289,6 +295,7 @@ class EngineArgs:
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
     download_dir: Optional[str] = LoadConfig.download_dir
+    safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
     load_format: Union[str, LoadFormats] = LoadConfig.load_format
     config_format: str = ModelConfig.config_format
     dtype: ModelDType = ModelConfig.dtype
@@ -306,6 +313,8 @@ class EngineArgs:
     # number of P/D disaggregation (or other disaggregation) workers
     pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
     tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
+    decode_context_parallel_size: int = \
+        ParallelConfig.decode_context_parallel_size
     data_parallel_size: int = ParallelConfig.data_parallel_size
     data_parallel_rank: Optional[int] = None
     data_parallel_start_rank: Optional[int] = None
@@ -332,6 +341,7 @@ class EngineArgs:
     swap_space: float = CacheConfig.swap_space
     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+    kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
     max_num_batched_tokens: Optional[
         int] = SchedulerConfig.max_num_batched_tokens
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
@@ -417,8 +427,6 @@ class EngineArgs:
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
     scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls
 
-    override_neuron_config: dict[str, Any] = \
-        get_field(ModelConfig, "override_neuron_config")
     override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
         ModelConfig.override_pooler_config
     compilation_config: CompilationConfig = \
@@ -547,7 +555,6 @@ class EngineArgs:
             help="Disable async output processing. This may result in "
             "lower performance.")
         model_group.add_argument("--config-format",
-                                 choices=[f.value for f in ConfigFormat],
                                  **model_kwargs["config_format"])
         # This one is a special case because it can bool
         # or str. TODO: Handle this in get_kwargs
@@ -559,8 +566,6 @@ class EngineArgs:
                                  help=model_kwargs["hf_token"]["help"])
         model_group.add_argument("--hf-overrides",
                                  **model_kwargs["hf_overrides"])
-        model_group.add_argument("--override-neuron-config",
-                                 **model_kwargs["override_neuron_config"])
         model_group.add_argument("--override-pooler-config",
                                  **model_kwargs["override_pooler_config"])
         model_group.add_argument("--logits-processor-pattern",
@@ -590,6 +595,8 @@ class EngineArgs:
         load_group.add_argument("--load-format", **load_kwargs["load_format"])
         load_group.add_argument("--download-dir",
                                 **load_kwargs["download_dir"])
+        load_group.add_argument("--safetensors-load-strategy",
+                                **load_kwargs["safetensors_load_strategy"])
         load_group.add_argument("--model-loader-extra-config",
                                 **load_kwargs["model_loader_extra_config"])
         load_group.add_argument("--ignore-patterns",
@@ -636,6 +643,9 @@ class EngineArgs:
             **parallel_kwargs["pipeline_parallel_size"])
         parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                     **parallel_kwargs["tensor_parallel_size"])
+        parallel_group.add_argument(
+            "--decode-context-parallel-size", "-dcp",
+            **parallel_kwargs["decode_context_parallel_size"])
         parallel_group.add_argument("--data-parallel-size", "-dp",
                                     **parallel_kwargs["data_parallel_size"])
         parallel_group.add_argument(
@@ -731,6 +741,8 @@ class EngineArgs:
         cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
         cache_group.add_argument("--gpu-memory-utilization",
                                  **cache_kwargs["gpu_memory_utilization"])
+        cache_group.add_argument("--kv-cache-memory-bytes",
+                                 **cache_kwargs["kv_cache_memory_bytes"])
         cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
         cache_group.add_argument("--kv-cache-dtype",
                                  **cache_kwargs["cache_dtype"])
@@ -987,7 +999,6 @@ class EngineArgs:
             mm_processor_kwargs=self.mm_processor_kwargs,
             mm_processor_cache_gb=self.mm_processor_cache_gb,
             mm_encoder_tp_mode=self.mm_encoder_tp_mode,
-            override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
             logits_processor_pattern=self.logits_processor_pattern,
             generation_config=self.generation_config,
@@ -1024,6 +1035,7 @@ class EngineArgs:
         return LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
+            safetensors_load_strategy=self.safetensors_load_strategy,
             device="cpu"
             if is_online_quantization(self.quantization) else None,
             model_loader_extra_config=self.model_loader_extra_config,
@@ -1053,9 +1065,10 @@ class EngineArgs:
             SpeculatorsConfig)
 
         if self.speculative_config is None:
-            hf_config = get_config(self.hf_config_path or self.model,
-                                   self.trust_remote_code, self.revision,
-                                   self.code_revision, self.config_format)
+            hf_config = get_config(
+                self.hf_config_path or target_model_config.model,
+                self.trust_remote_code, self.revision, self.code_revision,
+                self.config_format)
 
             # if loading a SpeculatorsConfig, load the speculative_config
             # details from the config directly
@@ -1065,7 +1078,7 @@ class EngineArgs:
                 self.speculative_config = {}
                 self.speculative_config[
                     "num_speculative_tokens"] = hf_config.num_lookahead_tokens
-                self.speculative_config["model"] = self.model
+                self.speculative_config["model"] = target_model_config.model
                 self.speculative_config["method"] = hf_config.method
             else:
                 return None
@@ -1156,9 +1169,21 @@ class EngineArgs:
             # global layers in interleaved sliding window models.
             sliding_window = model_config.get_sliding_window()
 
+        # Note(hc): In the current implementation of decode context
+        # parallel(DCP), tp_size needs to be divisible by dcp_size,
+        # because the world size does not change by dcp, it simply
+        # reuses the GPUs of TP group, and split one TP group into
+        # tp_size//dcp_size DCP groups.
+        assert self.tensor_parallel_size % self.decode_context_parallel_size \
+            == 0, (
+            f"tp_size={self.tensor_parallel_size} must be divisible by"
+            f"dcp_size={self.decode_context_parallel_size}."
+        )
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
+            kv_cache_memory_bytes=self.kv_cache_memory_bytes,
             swap_space=self.swap_space,
             cache_dtype=self.kv_cache_dtype,
             is_attention_free=model_config.is_attention_free,
@@ -1306,6 +1331,7 @@ class EngineArgs:
             distributed_executor_backend=self.distributed_executor_backend,
             worker_cls=self.worker_cls,
             worker_extension_cls=self.worker_extension_cls,
+            decode_context_parallel_size=self.decode_context_parallel_size,
         )
 
         speculative_config = self.create_speculative_config(
@@ -1436,17 +1462,6 @@ class EngineArgs:
                                recommend_to_remove=True)
             return False
 
-        # Triton v3.3 has f16 conversion regression issue on Turing and Volta,
-        # which broke fp16 inference
-        # see: https://github.com/triton-lang/triton/issues/6698
-        if (current_platform.is_cuda()
-                and not current_platform.has_device_capability(80)
-                and model_config.dtype == torch.float16):
-            _raise_or_fallback(
-                feature_name="Compute Capability < 8.0 with FP16",
-                recommend_to_remove=False)
-            return False
-
         if self.kv_cache_dtype != "auto":
             supported = current_platform.is_kv_cache_dtype_supported(
                 self.kv_cache_dtype, model_config)
@@ -1476,12 +1491,6 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False
 
-        # No OTLP observability so far.
-        if (self.otlp_traces_endpoint or self.collect_detailed_traces):
-            _raise_or_fallback(feature_name="--otlp-traces-endpoint",
-                               recommend_to_remove=False)
-            return False
-
         # V1 supports N-gram, Medusa, and Eagle speculative decoding.
         if (self.speculative_config is not None
                 and self.speculative_config.get("method") == "draft_model"):
@@ -1499,8 +1508,11 @@ class EngineArgs:
             "TRITON_MLA",
             "CUTLASS_MLA",
             "FLASHMLA",
+            "FLASHMLA_VLLM_V1",
+            "FLASH_ATTN_MLA",
             "FLASHINFER",
             "FLASHINFER_VLLM_V1",
+            "FLASHINFER_MLA",
             "ROCM_AITER_MLA",
             "TORCH_SDPA_VLLM_V1",
             "FLEX_ATTENTION",
@@ -1589,20 +1601,12 @@ class EngineArgs:
                 "in low performance due to small KV cache size. Consider "
                 "setting --max-model-len to a smaller value.", max_model_len)
 
-        # if using prefix caching, we must set a hash algo
-        if self.enable_prefix_caching:
-            # Disable prefix caching for multimodal models for VLLM_V0.
-            if model_config.is_multimodal_model:
-                logger.warning(
-                    "--enable-prefix-caching is not supported for multimodal "
-                    "models in V0 and has been disabled.")
-                self.enable_prefix_caching = False
-
-            # VLLM_V0 only supports builtin hash algo for prefix caching.
-            if self.prefix_caching_hash_algo == "sha256":
-                raise ValueError(
-                    "sha256 is not supported for prefix caching in V0 engine. "
-                    "Please use 'builtin'.")
+        # Disable prefix caching for multimodal models for VLLM_V0.
+        if self.enable_prefix_caching and model_config.is_multimodal_model:
+            logger.warning(
+                "--enable-prefix-caching is not supported for multimodal "
+                "models in V0 and has been disabled.")
+            self.enable_prefix_caching = False
 
         # Set max_num_seqs to 256 for VLLM_V0.
         if self.max_num_seqs is None:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 9f9ad1854c3b6b02de2fd13008ea4b23dd209000..c53ece18964cb46d32a015def5d559f967777d0b 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
 from weakref import ReferenceType
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
@@ -717,7 +718,7 @@ class AsyncLLMEngine(EngineClient):
                 # Stop the execute model loop in parallel workers until there
                 # are more requests to process. This avoids waiting
                 # indefinitely in torch.distributed ops which may otherwise
-                # timeout, and unblocks the RPC thread in the workers so that
+                # time out, and unblocks the RPC thread in the workers so that
                 # they can process any other queued control plane messages,
                 # such as add/remove lora adapters.
                 await engine.engine.stop_remote_worker_execution_loop_async()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 10ded6f16d41c5d6e0f322c1a31bb3fffc255f86..c303d093f6324d538f2440e6cd9f7c431f25e1d7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -16,9 +16,9 @@ import torch
 from typing_extensions import TypeVar
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
-                         VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase, Stats
@@ -278,7 +278,8 @@ class LLMEngine:
                     self.cache_config.block_size,
                     "gpu_memory_utilization":
                     self.cache_config.gpu_memory_utilization,
-
+                    "kv_cache_memory_bytes":
+                    self.cache_config.kv_cache_memory_bytes,
                     # Quantization
                     "quantization":
                     self.model_config.quantization,
@@ -1414,7 +1415,7 @@ class LLMEngine:
         num_generation_tokens_iter = 0
         num_tokens_iter = 0
         time_to_first_tokens_iter: List[float] = []
-        time_per_output_tokens_iter: List[float] = []
+        inter_token_latencies_iter: List[float] = []
         num_preemption_iter = (0 if scheduler_outputs is None else
                                scheduler_outputs.preempted)
 
@@ -1498,9 +1499,9 @@ class LLMEngine:
                         num_generation_tokens_from_prefill_groups += (
                             seq_group.num_seqs())
                 else:
-                    # TPOTs.
+                    # ITLs
                     latency = seq_group.get_last_token_latency()
-                    time_per_output_tokens_iter.append(latency)
+                    inter_token_latencies_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
                         # is called following init_multi_step(), which
@@ -1582,7 +1583,7 @@ class LLMEngine:
             num_generation_tokens_iter=num_generation_tokens_iter,
             num_tokens_iter=num_tokens_iter,
             time_to_first_tokens_iter=time_to_first_tokens_iter,
-            time_per_output_tokens_iter=time_per_output_tokens_iter,
+            inter_token_latencies_iter=inter_token_latencies_iter,
             num_preemption_iter=num_preemption_iter,
 
             # Request stats
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index ba8dbd1fad791df09bd1f223eb6ad29ad72e0dca..0a8709db40880a6d7b07d610a482aec878899518 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -113,9 +113,21 @@ class Metrics:
                 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
                 2560.0
             ])
+        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         self.histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds."
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ])
+        self.histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter token latency in seconds.",
             labelnames=labelnames,
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
@@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
         self._log_histogram(self.metrics.histogram_time_to_first_token,
                             stats.time_to_first_tokens_iter)
         self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.time_per_output_tokens_iter)
+                            stats.inter_token_latencies_iter)
+        self._log_histogram(self.metrics.histogram_inter_token_latency,
+                            stats.inter_token_latencies_iter)
 
         # Request level data
         # Latency
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 3281a9121a9df4d7cc0f9ce8ed86f7f766fda56e..9778ab5a8c99b266c481ef17174466f34a4c446c 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -43,7 +43,7 @@ class Stats:
     num_generation_tokens_iter: int
     num_tokens_iter: int
     time_to_first_tokens_iter: List[float]
-    time_per_output_tokens_iter: List[float]
+    inter_token_latencies_iter: List[float]
     num_preemption_iter: int
 
     # Request stats (should have _requests suffix)
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 2d3248859c940aa3c184dc20060c20d1736376c9..7d1f29a9824d79a0b167b7fcfd333b92e64bb0a2 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -235,7 +235,7 @@ class MQLLMEngineClient(EngineClient):
                         # therefore we have to inform that the current
                         # processed requests failed as well. Send back a dead
                         # engine error give this feedback and also give a
-                        # 'hint' to the server to shutdown next.
+                        # 'hint' to the server to shut down next.
                         exception = self.dead_error
 
                     if request_id is None:
@@ -270,7 +270,7 @@ class MQLLMEngineClient(EngineClient):
             queue.put_nowait(request_output)
 
     async def setup(self):
-        """Setup the client before it starts sending server requests."""
+        """Set up the client before it starts sending server requests."""
 
         # Start output_loop
         if self.output_loop is None:
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 343b8df7e87bda22cd9456e8b7f973bbd24cd366..138283d4c8a759fef52923ae2b639b7cd19fcf4f 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -49,7 +49,7 @@ class MQLLMEngine:
 
     This class is used to wrap the
     [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
-    in concurrnet manner. It runs a background loop and uses zeromq to
+    in concurrent manner. It runs a background loop and uses zeromq to
     receive new requests and stream outputs incrementally via ipc.
 
     The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index b0b11a33a444398ad8048cfd6ce26f8b48886c37..94eacfbdfb3011f0f62b2ea8a78ea1a376606bc1 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -78,6 +78,7 @@ class EngineClient(ABC):
         preprocessor = await self.get_input_preprocessor()
         tokenizer_group = preprocessor.get_tokenizer_group()
         tokenizer = await tokenizer_group.get_lora_tokenizer_async()
+        eos_token_id = tokenizer.eos_token_id
 
         if is_explicit_encoder_decoder_prompt(prompt):
             raise NotImplementedError
@@ -104,7 +105,7 @@ class EngineClient(ABC):
         tokenized_length = len(prompt_token_ids)
 
         sort_beams_key = create_sort_beams_key_function(
-            tokenizer.eos_token_id, length_penalty)
+            eos_token_id, length_penalty)
 
         beam_search_params = SamplingParams(
             logprobs=2 * beam_width,
@@ -154,7 +155,7 @@ class EngineClient(ABC):
                 if result.outputs[0].logprobs is not None:
                     logprobs = result.outputs[0].logprobs[0]
                     for token_id, logprob_obj in logprobs.items():
-                        if token_id == tokenizer.eos_token_id and \
+                        if token_id == eos_token_id and \
                             not ignore_eos:
                             completed.append(
                                 BeamSearchSequence(
@@ -166,7 +167,7 @@ class EngineClient(ABC):
                                     cum_logprob=current_beam.cum_logprob +
                                     logprob_obj.logprob,
                                     finish_reason="stop",
-                                    stop_reason=tokenizer.eos_token_id))
+                                    stop_reason=eos_token_id))
                         else:
                             new_beams.append(
                                 BeamSearchSequence(
@@ -189,14 +190,14 @@ class EngineClient(ABC):
         best_beams = sorted_completed[:beam_width]
 
         for beam in best_beams:
-            if (beam.tokens[-1] == tokenizer.eos_token_id and not ignore_eos):
+            if (beam.tokens[-1] == eos_token_id and not ignore_eos):
                 # Skip the eos token in the text.
                 tokens = beam.tokens[tokenized_length:-1]
             else:
                 tokens = beam.tokens[tokenized_length:]
             beam.text = tokenizer.decode(tokens)
 
-        beam_search_output = RequestOutput(
+        yield RequestOutput(
             request_id=request_id,
             prompt=prompt_text,
             outputs=[
@@ -214,8 +215,6 @@ class EngineClient(ABC):
             prompt_token_ids=prompt_token_ids,
             prompt_logprobs=None)
 
-        yield beam_search_output
-
     @abstractmethod
     def encode(
         self,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 1954cbcbf1edd213e03b44124a500ada587b9aa6..b53dbfb3a26a224ae8793c773234727d14ac08b4 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -41,7 +41,8 @@ from typing_extensions import Required, TypeAlias, TypedDict
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models import SupportsMultiModal
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
+                             MultiModalUUIDDict)
 from vllm.multimodal.utils import MediaConnector
 # yapf: disable
 from vllm.transformers_utils.chat_templates import (
@@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
 
     type: Required[Literal["audio_url"]]
     """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
@@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
     """
     type: Required[Literal["image_embeds"]]
     """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class VideoURL(TypedDict, total=False):
@@ -97,12 +108,18 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
 
     type: Required[Literal["video_url"]]
     """The type of the content part."""
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class PILImage(BaseModel):
     """
     A PIL.Image.Image object.
     """
+
     image_pil: Image.Image
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -115,7 +132,13 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
         "image_pil": ImageAsset('cherry_blossom').pil_image
     }
     """
+
     image_pil: Required[PILImage]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
@@ -127,7 +150,13 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
         "image_url": "https://example.com/image.jpg"
     }
     """
+
     image_url: Required[str]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
@@ -138,6 +167,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
         "audio_url": "https://example.com/audio.mp3"
     }
     """
+
     audio_url: Required[str]
 
 
@@ -149,7 +179,13 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
         "video_url": "https://example.com/video.mp4"
     }
     """
+
     video_url: Required[str]
+    uuid: Optional[str]
+    """
+    User-provided UUID of a media. User must guarantee that it is properly
+    generated and unique for different medias.
+    """
 
 
 class CustomThinkCompletionContentParam(TypedDict, total=False):
@@ -174,19 +210,24 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
 
 
 ChatCompletionContentPartParam: TypeAlias = Union[
-    OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
+    OpenAIChatCompletionContentPartParam,
+    ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
-    ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    ChatCompletionContentPartVideoParam,
+    ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
-    CustomChatCompletionContentSimpleVideoParam, str,
-    CustomThinkCompletionContentParam]
+    CustomChatCompletionContentSimpleVideoParam,
+    str,
+    CustomThinkCompletionContentParam,
+]
 
 
 class CustomChatCompletionMessageParam(TypedDict, total=False):
     """Enables custom roles in the Chat Completion API."""
+
     role: Required[str]
     """The role of the message's author."""
 
@@ -207,9 +248,11 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     """The tool calls generated by the model, such as function calls."""
 
 
-ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam,
-                                   CustomChatCompletionMessageParam,
-                                   OpenAIHarmonyMessage]
+ChatCompletionMessageParam = Union[
+    OpenAIChatCompletionMessageParam,
+    CustomChatCompletionMessageParam,
+    OpenAIHarmonyMessage,
+]
 
 
 # TODO: Make fields ReadOnly once mypy supports it
@@ -262,13 +305,13 @@ def _is_var_or_elems_access(
     key: Optional[str] = None,
 ) -> bool:
     if isinstance(node, jinja2.nodes.Filter):
-        return (node.node is not None
-                and _is_var_or_elems_access(node.node, varname, key))
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key)
     if isinstance(node, jinja2.nodes.Test):
         return _is_var_or_elems_access(node.node, varname, key)
 
-    if (isinstance(node, jinja2.nodes.Getitem)
-            and isinstance(node.arg, jinja2.nodes.Slice)):
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+            node.arg, jinja2.nodes.Slice):
         return _is_var_or_elems_access(node.node, varname, key)
 
     # yapf: disable
@@ -373,15 +416,18 @@ def resolve_mistral_chat_template(
 ) -> Optional[str]:
     if chat_template is not None:
         logger.warning_once(
-            "'chat_template' cannot be overridden for mistral tokenizer.")
+            "'chat_template' cannot be overridden for mistral tokenizer."
+        )
     if "add_generation_prompt" in kwargs:
         logger.warning_once(
             "'add_generation_prompt' is not supported for mistral tokenizer, "
-            "so it will be ignored.")
+            "so it will be ignored."
+        )
     if "continue_final_message" in kwargs:
         logger.warning_once(
             "'continue_final_message' is not supported for mistral tokenizer, "
-            "so it will be ignored.")
+            "so it will be ignored."
+        )
     return None
 
 
@@ -401,23 +447,35 @@ def resolve_hf_chat_template(
         try:
             processor = cached_get_processor(
                 tokenizer.name_or_path,
-                processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast,
-                               ProcessorMixin),
+                processor_cls=(
+                    PreTrainedTokenizer,
+                    PreTrainedTokenizerFast,
+                    ProcessorMixin,
+                ),
                 trust_remote_code=model_config.trust_remote_code,
             )
-            if isinstance(processor, ProcessorMixin) and \
-                hasattr(processor, 'chat_template') and \
-                processor.chat_template is not None:
+            if (
+                isinstance(processor, ProcessorMixin)
+                and hasattr(processor, "chat_template")
+                and processor.chat_template is not None
+            ):
                 return processor.chat_template
         except Exception:
-            logger.debug("Failed to load AutoProcessor chat template for %s", tokenizer.name_or_path, exc_info=True)  # noqa: E501
+            logger.debug(
+                "Failed to load AutoProcessor chat template for %s",
+                tokenizer.name_or_path,
+                exc_info=True,
+            )  # noqa: E501
 
     # 3rd priority: AutoTokenizer chat template
     try:
         return tokenizer.get_chat_template(chat_template, tools=tools)
     except Exception:
-        logger.debug("Failed to load AutoTokenizer chat template for %s",
-                     tokenizer.name_or_path, exc_info=True)
+        logger.debug(
+            "Failed to load AutoTokenizer chat template for %s",
+            tokenizer.name_or_path,
+            exc_info=True,
+        )
 
     # 4th priority: Predefined fallbacks
     path = get_chat_template_fallback_path(
@@ -425,12 +483,16 @@ def resolve_hf_chat_template(
         tokenizer_name_or_path=model_config.tokenizer,
     )
     if path is not None:
-        logger.info("Loading chat template fallback for %s as there isn't one "
-                    "defined on HF Hub.", tokenizer.name_or_path)
+        logger.info(
+            "Loading chat template fallback for %s as there isn't one "
+            "defined on HF Hub.",
+            tokenizer.name_or_path,
+        )
         chat_template = load_chat_template(path)
     else:
-        logger.debug("There is no chat template fallback for %s",
-                     tokenizer.name_or_path)
+        logger.debug(
+            "There is no chat template fallback for %s", tokenizer.name_or_path
+        )
 
     return chat_template
 
@@ -452,11 +514,17 @@ def _resolve_chat_template_content_format(
     else:
         hf_chat_template = None
 
-    jinja_text = (hf_chat_template if isinstance(hf_chat_template, str)
-                  else load_chat_template(chat_template, is_literal=True))
+    jinja_text = (
+        hf_chat_template
+        if isinstance(hf_chat_template, str)
+        else load_chat_template(chat_template, is_literal=True)
+    )
 
-    detected_format = ("string" if jinja_text is None else
-                       _detect_content_format(jinja_text, default="string"))
+    detected_format = (
+        "string"
+        if jinja_text is None
+        else _detect_content_format(jinja_text, default="string")
+    )
 
     return detected_format
 
@@ -512,7 +580,6 @@ def resolve_chat_template_content_format(
     return detected_format
 
 
-
 ModalityStr = Literal["image", "audio", "video", "image_embeds"]
 _T = TypeVar("_T")
 
@@ -531,6 +598,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         self._tokenizer = tokenizer
 
         self._items_by_modality = defaultdict[str, list[_T]](list)
+        self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
 
     @property
     def model_config(self) -> ModelConfig:
@@ -539,6 +607,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     @cached_property
     def model_cls(self) -> type[SupportsMultiModal]:
         from vllm.model_executor.model_loader import get_model_cls
+
         model_cls = get_model_cls(self.model_config)
         return cast(type[SupportsMultiModal], model_cls)
 
@@ -554,10 +623,15 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def mm_processor(self):
         return self.mm_registry.create_processor(self.model_config)
 
-    def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
+    def add(
+        self, modality: ModalityStr, item: _T, uuid: Optional[str] = None
+    ) -> Optional[str]:
         """
         Add a multi-modal item to the current prompt and returns the
         placeholder string to use, if any.
+
+        An optional uuid can be added which serves as a unique identifier of the
+        media. 
         """
         input_modality = modality.replace("_embeds", "")
         num_items = len(self._items_by_modality[modality]) + 1
@@ -565,37 +639,64 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         self.mm_processor.validate_num_items(input_modality, num_items)
 
         self._items_by_modality[modality].append(item)
+        self._uuids_by_modality[modality].append(uuid)
 
         return self.model_cls.get_placeholder_str(modality, num_items)
 
+    def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]:
+        if not self._items_by_modality:
+            return None
+        mm_uuids = {}
+        uuids_by_modality = dict(self._uuids_by_modality)
+        if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality:
+            raise ValueError(
+                "Mixing raw image and embedding inputs is not allowed"
+            )
+
+        if "image_embeds" in uuids_by_modality:
+            image_embeds_uuids = uuids_by_modality["image_embeds"]
+            if len(image_embeds_uuids) > 1:
+                raise ValueError(
+                    "Only one message can have {'type': 'image_embeds'}"
+                )
+            mm_uuids["image"] = uuids_by_modality["image_embeds"]
+        if "image" in uuids_by_modality:
+            mm_uuids["image"] = uuids_by_modality["image"]  # UUIDs of images
+        if "audio" in uuids_by_modality:
+            mm_uuids["audio"] = uuids_by_modality["audio"]  # UUIDs of audios
+        if "video" in uuids_by_modality:
+            mm_uuids["video"] = uuids_by_modality["video"]  # UUIDs of videos
+        return mm_uuids
+
     @abstractmethod
     def create_parser(self) -> "BaseMultiModalContentParser":
         raise NotImplementedError
 
 
 class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
-
     def all_mm_data(self) -> Optional[MultiModalDataDict]:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
         items_by_modality = dict(self._items_by_modality)
         if "image" in items_by_modality and "image_embeds" in items_by_modality:
-            raise ValueError(\
-                "Mixing raw image and embedding inputs is not allowed")
+            raise ValueError(
+                "Mixing raw image and embedding inputs is not allowed"
+            )
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
             if len(image_embeds_lst) > 1:
-                raise ValueError(\
-                    "Only one message can have {'type': 'image_embeds'}")
+                raise ValueError(
+                    "Only one message can have {'type': 'image_embeds'}"
+                )
             mm_inputs["image"] = image_embeds_lst[0]
         if "image" in items_by_modality:
-            mm_inputs["image"] = items_by_modality["image"] # A list of images
+            mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:
-            mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
+            mm_inputs["audio"] = items_by_modality["audio"]  # A list of audios
         if "video" in items_by_modality:
-            mm_inputs["video"] = items_by_modality["video"] # A list of videos
+            mm_inputs["video"] = items_by_modality["video"]  # A list of videos
         return mm_inputs
 
     def create_parser(self) -> "BaseMultiModalContentParser":
@@ -603,32 +704,33 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
 
 class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
-
     async def all_mm_data(self) -> Optional[MultiModalDataDict]:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
         items_by_modality = {
-                modality: await asyncio.gather(*items)
-                for modality, items in self._items_by_modality.items()
-            }
+            modality: await asyncio.gather(*items)
+            for modality, items in self._items_by_modality.items()
+        }
 
         if "image" in items_by_modality and "image_embeds" in items_by_modality:
             raise ValueError(
-                "Mixing raw image and embedding inputs is not allowed")
+                "Mixing raw image and embedding inputs is not allowed"
+            )
 
         if "image_embeds" in items_by_modality:
             image_embeds_lst = items_by_modality["image_embeds"]
             if len(image_embeds_lst) > 1:
                 raise ValueError(
-                    "Only one message can have {'type': 'image_embeds'}")
+                    "Only one message can have {'type': 'image_embeds'}"
+                )
             mm_inputs["image"] = image_embeds_lst[0]
         if "image" in items_by_modality:
-            mm_inputs["image"] = items_by_modality["image"] # A list of images
+            mm_inputs["image"] = items_by_modality["image"]  # A list of images
         if "audio" in items_by_modality:
-            mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
+            mm_inputs["audio"] = items_by_modality["audio"]  # A list of audios
         if "video" in items_by_modality:
-            mm_inputs["video"] = items_by_modality["video"] # A list of videos
+            mm_inputs["video"] = items_by_modality["video"]  # A list of videos
         return mm_inputs
 
     def create_parser(self) -> "BaseMultiModalContentParser":
@@ -636,7 +738,6 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
 
 
 class BaseMultiModalContentParser(ABC):
-
     def __init__(self) -> None:
         super().__init__()
 
@@ -648,8 +749,9 @@ class BaseMultiModalContentParser(ABC):
         # }
         self._placeholder_storage: dict[str, list] = defaultdict(list)
 
-    def _add_placeholder(self, modality: ModalityStr,
-                         placeholder: Optional[str]):
+    def _add_placeholder(
+        self, modality: ModalityStr, placeholder: Optional[str]
+    ):
         mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality]
         if placeholder:
             self._placeholder_storage[mod_placeholder].append(placeholder)
@@ -658,33 +760,39 @@ class BaseMultiModalContentParser(ABC):
         return dict(self._placeholder_storage)
 
     @abstractmethod
-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_image_embeds(
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
+    ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_pil(self, image_pil: Image.Image) -> None:
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
         raise NotImplementedError
 
 
 class MultiModalContentParser(BaseMultiModalContentParser):
-
     def __init__(self, tracker: MultiModalItemTracker) -> None:
         super().__init__()
 
@@ -695,70 +803,79 @@ class MultiModalContentParser(BaseMultiModalContentParser):
             allowed_local_media_path=tracker.allowed_local_media_path,
         )
 
-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
         image = self._connector.fetch_image(image_url)
 
-        placeholder = self._tracker.add("image", image)
+        placeholder = self._tracker.add("image", image, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_image_embeds(
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
+    ) -> None:
         if isinstance(image_embeds, dict):
             embeds = {
                 k: self._connector.fetch_image_embedding(v)
                 for k, v in image_embeds.items()
             }
-            placeholder = self._tracker.add("image_embeds", embeds)
+            placeholder = self._tracker.add("image_embeds", embeds, uuid)
 
         if isinstance(image_embeds, str):
             embedding = self._connector.fetch_image_embedding(image_embeds)
-            placeholder = self._tracker.add("image_embeds", embedding)
+            placeholder = self._tracker.add("image_embeds", embedding, uuid)
 
         self._add_placeholder("image", placeholder)
 
-    def parse_image_pil(self, image_pil: Image.Image) -> None:
-        placeholder = self._tracker.add("image", image_pil)
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
+        placeholder = self._tracker.add("image", image_pil, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
         audio = self._connector.fetch_audio(audio_url)
 
-        placeholder = self._tracker.add("audio", audio)
+        placeholder = self._tracker.add("audio", audio, uuid)
         self._add_placeholder("audio", placeholder)
 
-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
         audio_data = input_audio.get("data", "")
         audio_format = input_audio.get("format", "")
         audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        return self.parse_audio(audio_url)
+        return self.parse_audio(audio_url, uuid)
 
-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
         video = self._connector.fetch_video(video_url=video_url)
 
-        placeholder = self._tracker.add("video", video)
+        placeholder = self._tracker.add("video", video, uuid)
         self._add_placeholder("video", placeholder)
 
 
 class AsyncMultiModalContentParser(BaseMultiModalContentParser):
-
     def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
         super().__init__()
 
         self._tracker = tracker
         self._connector = MediaConnector(
             media_io_kwargs=self._tracker._model_config.media_io_kwargs,
-            allowed_local_media_path=tracker.allowed_local_media_path
+            allowed_local_media_path=tracker.allowed_local_media_path,
         )
 
-    def parse_image(self, image_url: str) -> None:
+    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
         image_coro = self._connector.fetch_image_async(image_url)
 
-        placeholder = self._tracker.add("image", image_coro)
+        placeholder = self._tracker.add("image", image_coro, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_image_embeds(
+        self,
+        image_embeds: Union[str, dict[str, str]],
+        uuid: Optional[str] = None,
+    ) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
 
         if isinstance(image_embeds, dict):
@@ -769,37 +886,40 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
             future.set_result(embeds)
 
         if isinstance(image_embeds, str):
-            embedding = self._connector.\
-                fetch_image_embedding(image_embeds)
+            embedding = self._connector.fetch_image_embedding(image_embeds)
             future.set_result(embedding)
 
-        placeholder = self._tracker.add("image_embeds", future)
+        placeholder = self._tracker.add("image_embeds", future, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_image_pil(self, image_pil: Image.Image) -> None:
+    def parse_image_pil(
+        self, image_pil: Image.Image, uuid: Optional[str] = None
+    ) -> None:
         future: asyncio.Future[Image.Image] = asyncio.Future()
         future.set_result(image_pil)
 
-        placeholder = self._tracker.add("image", future)
+        placeholder = self._tracker.add("image", future, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(self, audio_url: str) -> None:
+    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
 
-        placeholder = self._tracker.add("audio", audio_coro)
+        placeholder = self._tracker.add("audio", audio_coro, uuid)
         self._add_placeholder("audio", placeholder)
 
-    def parse_input_audio(self, input_audio: InputAudio) -> None:
+    def parse_input_audio(
+        self, input_audio: InputAudio, uuid: Optional[str] = None
+    ) -> None:
         audio_data = input_audio.get("data", "")
         audio_format = input_audio.get("format", "")
         audio_url = f"data:audio/{audio_format};base64,{audio_data}"
 
-        return self.parse_audio(audio_url)
+        return self.parse_audio(audio_url, uuid)
 
-    def parse_video(self, video_url: str) -> None:
+    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
         video = self._connector.fetch_video_async(video_url=video_url)
 
-        placeholder = self._tracker.add("video", video)
+        placeholder = self._tracker.add("video", video, uuid)
         self._add_placeholder("video", placeholder)
 
 
@@ -809,20 +929,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]):
         return
 
     elif isinstance(chat_template, Path) and not chat_template.exists():
-        raise FileNotFoundError(
-            "the supplied chat template path doesn't exist")
+        raise FileNotFoundError("the supplied chat template path doesn't exist")
 
     elif isinstance(chat_template, str):
         JINJA_CHARS = "{}\n"
-        if not any(c in chat_template
-                   for c in JINJA_CHARS) and not Path(chat_template).exists():
+        if (
+            not any(c in chat_template for c in JINJA_CHARS)
+            and not Path(chat_template).exists()
+        ):
             raise ValueError(
                 f"The supplied chat template string ({chat_template}) "
-                f"appears path-like, but doesn't exist!")
+                f"appears path-like, but doesn't exist!"
+            )
 
     else:
         raise TypeError(
-            f"{type(chat_template)} is not a valid chat template type")
+            f"{type(chat_template)} is not a valid chat template type"
+        )
 
 
 def _load_chat_template(
@@ -835,8 +958,9 @@ def _load_chat_template(
 
     if is_literal:
         if isinstance(chat_template, Path):
-            raise TypeError("chat_template is expected to be read directly "
-                            "from its value")
+            raise TypeError(
+                "chat_template is expected to be read directly from its value"
+            )
 
         return chat_template
 
@@ -849,9 +973,11 @@ def _load_chat_template(
 
         JINJA_CHARS = "{}\n"
         if not any(c in chat_template for c in JINJA_CHARS):
-            msg = (f"The supplied chat template ({chat_template}) "
-                   f"looks like a file path, but it failed to be "
-                   f"opened. Reason: {e}")
+            msg = (
+                f"The supplied chat template ({chat_template}) "
+                f"looks like a file path, but it failed to be "
+                f"opened. Reason: {e}"
+            )
             raise ValueError(msg) from e
 
         # If opening a file fails, set chat template to be args to
@@ -870,8 +996,9 @@ def load_chat_template(
     return _cached_load_chat_template(chat_template, is_literal=is_literal)
 
 
-def _get_interleaved_text_prompt(placeholder_storage: dict[str, list],
-                                 texts: list[str]) -> str:
+def _get_interleaved_text_prompt(
+    placeholder_storage: dict[str, list], texts: list[str]
+) -> str:
     for idx, elem in enumerate(texts):
         if elem in placeholder_storage:
             texts[idx] = placeholder_storage[elem].pop(0)
@@ -881,10 +1008,11 @@ def _get_interleaved_text_prompt(placeholder_storage: dict[str, list],
 
 # TODO: Let user specify how to insert multimodal tokens into prompt
 # (similar to chat template)
-def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
-                                     texts: list[str],
-                                     interleave_strings: bool
-                                     ) -> str:
+def _get_full_multimodal_text_prompt(
+    placeholder_storage: dict[str, list],
+    texts: list[str],
+    interleave_strings: bool,
+) -> str:
     """Combine multimodal prompts for a multimodal language model."""
 
     # flatten storage to make it looks like
@@ -907,7 +1035,6 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
     # Look through the text prompt to check for missing placeholders
     missing_placeholders: list[str] = []
     for placeholder in placeholder_counts:
-
         # For any existing placeholder in the text prompt, we leave it as is
         placeholder_counts[placeholder] -= text_prompt.count(placeholder)
 
@@ -916,15 +1043,18 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
                 "Placeholder count is negative! "
                 "Ensure that the 'interleave_strings' flag is disabled "
                 "(current value: %s) "
-                "when manually placing image placeholders.", interleave_strings
+                "when manually placing image placeholders.",
+                interleave_strings,
             )
             logger.debug("Input prompt: %s", text_prompt)
             raise ValueError(
                 f"Found more '{placeholder}' placeholders in input prompt than "
-                "actual multimodal data items.")
+                "actual multimodal data items."
+            )
 
-        missing_placeholders.extend([placeholder] *
-                                    placeholder_counts[placeholder])
+        missing_placeholders.extend(
+            [placeholder] * placeholder_counts[placeholder]
+        )
 
     # NOTE: Default behaviour: we always add missing placeholders
     # at the front of the prompt, if interleave_strings=False
@@ -944,7 +1074,8 @@ _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
 _ResponsesInputImageParser = TypeAdapter(
-    ResponseInputImageParam).validate_python
+    ResponseInputImageParam
+).validate_python
 _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
@@ -952,32 +1083,35 @@ MM_PARSER_MAP: dict[
     str,
     Callable[[ChatCompletionContentPartParam], _ContentPart],
 ] = {
-    "text":
-    lambda part: _TextParser(part).get("text", None),
-    "thinking":
-    lambda part: _ThinkParser(part).get("thinking", None),
-    "input_text":
-    lambda part: _TextParser(part).get("text", None),
-    "input_image":
-    lambda part: _ResponsesInputImageParser(part).get("image_url", None),
-    "image_url":
-    lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
-    "image_embeds":
-    lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "text": lambda part: _TextParser(part).get("text", None),
+    "thinking": lambda part: _ThinkParser(part).get("thinking", None),
+    "input_text": lambda part: _TextParser(part).get("text", None),
+    "input_image": lambda part: _ResponsesInputImageParser(part).get(
+        "image_url", None
+    ),
+    "image_url": lambda part: _ImageParser(part)
+    .get("image_url", {})
+    .get("url", None),
+    "image_embeds": lambda part: _ImageEmbedsParser(part).get(
+        "image_embeds", None
+    ),
     "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
-    "audio_url":
-    lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
-    "input_audio":
-    lambda part: _InputAudioParser(part).get("input_audio", None),
-    "refusal":
-    lambda part: _RefusalParser(part).get("refusal", None),
-    "video_url":
-    lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
+    "audio_url": lambda part: _AudioParser(part)
+    .get("audio_url", {})
+    .get("url", None),
+    "input_audio": lambda part: _InputAudioParser(part).get(
+        "input_audio", None
+    ),
+    "refusal": lambda part: _RefusalParser(part).get("refusal", None),
+    "video_url": lambda part: _VideoParser(part)
+    .get("video_url", {})
+    .get("url", None),
 }
 
 
 def _parse_chat_message_content_mm_part(
-        part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]:
+    part: ChatCompletionContentPartParam,
+) -> tuple[str, _ContentPart]:
     """
     Parses a given multi-modal content part based on its type.
 
@@ -993,7 +1127,8 @@ def _parse_chat_message_content_mm_part(
         ValueError: If the 'type' field is missing and no direct URL is found.
     """
     assert isinstance(
-        part, dict)  # This is needed to avoid mypy errors: part.get() from str
+        part, dict
+    )  # This is needed to avoid mypy errors: part.get() from str
     part_type = part.get("type", None)
 
     if isinstance(part_type, str) and part_type in MM_PARSER_MAP:
@@ -1002,8 +1137,10 @@ def _parse_chat_message_content_mm_part(
         # Special case for 'image_url.detail'
         # We only support 'auto', which is the default
         if part_type == "image_url" and part.get("detail", "auto") != "auto":
-            logger.warning("'image_url.detail' is currently not supported "
-                           "and will be ignored.")
+            logger.warning(
+                "'image_url.detail' is currently not supported "
+                "and will be ignored."
+            )
 
         return part_type, content
 
@@ -1011,19 +1148,22 @@ def _parse_chat_message_content_mm_part(
     # 'type' is required field by pydantic
     if part_type is None:
         if part.get("image_url") is not None:
-            image_params = cast(CustomChatCompletionContentSimpleImageParam,
-                                part)
+            image_params = cast(
+                CustomChatCompletionContentSimpleImageParam, part
+            )
             return "image_url", image_params.get("image_url", "")
         if part.get("audio_url") is not None:
-            audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
-                                part)
+            audio_params = cast(
+                CustomChatCompletionContentSimpleAudioParam, part
+            )
             return "audio_url", audio_params.get("audio_url", "")
         if part.get("input_audio") is not None:
             input_audio_params = cast(dict[str, str], part)
             return "input_audio", input_audio_params
         if part.get("video_url") is not None:
-            video_params = cast(CustomChatCompletionContentSimpleVideoParam,
-                                part)
+            video_params = cast(
+                CustomChatCompletionContentSimpleVideoParam, part
+            )
             return "video_url", video_params.get("video_url", "")
         # Raise an error if no 'type' or direct URL is found.
         raise ValueError("Missing 'type' field in multimodal part.")
@@ -1033,9 +1173,16 @@ def _parse_chat_message_content_mm_part(
     return part_type, "unknown part_type content"
 
 
-VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds", "image_pil",
-                                       "audio_url", "input_audio", "video_url")
+VALID_MESSAGE_CONTENT_MM_PART_TYPES = (
+    "text",
+    "refusal",
+    "image_url",
+    "image_embeds",
+    "image_pil",
+    "audio_url",
+    "input_audio",
+    "video_url",
+)
 
 
 def _parse_chat_message_content_parts(
@@ -1055,21 +1202,20 @@ def _parse_chat_message_content_parts(
             part,
             mm_parser,
             wrap_dicts=wrap_dicts,
-            interleave_strings=interleave_strings
+            interleave_strings=interleave_strings,
         )
         if parse_res:
             content.append(parse_res)
 
     if wrap_dicts:
         # Parsing wraps images and texts as interleaved dictionaries
-        return [ConversationMessage(role=role,
-                                    content=content)]  # type: ignore
+        return [ConversationMessage(role=role, content=content)]  # type: ignore
     texts = cast(list[str], content)
     mm_placeholder_storage = mm_parser.mm_placeholder_storage()
     if mm_placeholder_storage:
-        text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_storage,
-                                                       texts,
-                                                       interleave_strings)
+        text_prompt = _get_full_multimodal_text_prompt(
+            mm_placeholder_storage, texts, interleave_strings
+        )
     else:
         text_prompt = "\n".join(texts)
 
@@ -1099,46 +1245,59 @@ def _parse_chat_message_content_part(
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
         logger.warning(
             "Skipping multimodal part '%s' (type: '%s') "
-            "with empty / unparsable content.", part, part_type)
+            "with empty / unparsable content.",
+            part,
+            part_type,
+        )
         return None
 
     if part_type in ("text", "input_text", "refusal", "thinking"):
         str_content = cast(str, content)
         if wrap_dicts:
-            return {'type': 'text', 'text': str_content}
+            return {"type": "text", "text": str_content}
         else:
             return str_content
 
+    # For media items, if a user has provided one, use it. Otherwise, insert
+    # a placeholder empty uuid.
+    uuid = part.get("uuid", None)
+    if uuid is not None:
+        uuid = str(uuid)
+
     modality = None
     if part_type == "image_pil":
         image_content = cast(Image.Image, content)
-        mm_parser.parse_image_pil(image_content)
+        mm_parser.parse_image_pil(image_content, uuid)
         modality = "image"
     elif part_type in ("image_url", "input_image"):
         str_content = cast(str, content)
-        mm_parser.parse_image(str_content)
+        mm_parser.parse_image(str_content, uuid)
         modality = "image"
     elif part_type == "image_embeds":
         content = cast(Union[str, dict[str, str]], content)
-        mm_parser.parse_image_embeds(content)
+        mm_parser.parse_image_embeds(content, uuid)
         modality = "image"
     elif part_type == "audio_url":
         str_content = cast(str, content)
-        mm_parser.parse_audio(str_content)
+        mm_parser.parse_audio(str_content, uuid)
         modality = "audio"
     elif part_type == "input_audio":
         dict_content = cast(InputAudio, content)
-        mm_parser.parse_input_audio(dict_content)
+        mm_parser.parse_input_audio(dict_content, uuid)
         modality = "audio"
     elif part_type == "video_url":
         str_content = cast(str, content)
-        mm_parser.parse_video(str_content)
+        mm_parser.parse_video(str_content, uuid)
         modality = "video"
     else:
         raise NotImplementedError(f"Unknown part type: {part_type}")
 
-    return {'type': modality} if wrap_dicts else (
-        MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
+    return (
+        {"type": modality}
+        if wrap_dicts
+        else (
+            MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
+        )
     )
 
 
@@ -1171,14 +1330,16 @@ def _parse_chat_message_content(
     )
 
     for result_msg in result:
-        if role == 'assistant':
+        if role == "assistant":
             parsed_msg = _AssistantParser(message)
 
             # The 'tool_calls' is not None check ensures compatibility.
             # It's needed only if downstream code doesn't strictly
             # follow the OpenAI spec.
-            if ("tool_calls" in parsed_msg
-                and parsed_msg["tool_calls"] is not None):
+            if (
+                "tool_calls" in parsed_msg
+                and parsed_msg["tool_calls"] is not None
+            ):
                 result_msg["tool_calls"] = list(parsed_msg["tool_calls"])
         elif role == "tool":
             parsed_msg = _ToolParser(message)
@@ -1198,12 +1359,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
     # so, for messages that have tool_calls, parse the string (which we get
     # from openAI format) to dict
     for message in messages:
-        if (message["role"] == "assistant" and "tool_calls" in message
-                and isinstance(message["tool_calls"], list)):
-
+        if (
+            message["role"] == "assistant"
+            and "tool_calls" in message
+            and isinstance(message["tool_calls"], list)
+        ):
             for item in message["tool_calls"]:
                 item["function"]["arguments"] = json.loads(
-                    item["function"]["arguments"])
+                    item["function"]["arguments"]
+                )
 
 
 def parse_chat_messages(
@@ -1211,7 +1375,11 @@ def parse_chat_messages(
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
     content_format: _ChatTemplateContentFormat,
-) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]:
+) -> tuple[
+    list[ConversationMessage],
+    Optional[MultiModalDataDict],
+    Optional[MultiModalUUIDDict],
+]:
     conversation: list[ConversationMessage] = []
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
 
@@ -1224,14 +1392,14 @@ def parse_chat_messages(
                 content_format == "string"
                 and model_config.multimodal_config is not None
                 and model_config.multimodal_config.interleave_mm_strings
-            )
+            ),
         )
 
         conversation.extend(sub_messages)
 
     _postprocess_messages(conversation)
 
-    return conversation, mm_tracker.all_mm_data()
+    return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()
 
 
 def parse_chat_messages_futures(
@@ -1239,7 +1407,11 @@ def parse_chat_messages_futures(
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
     content_format: _ChatTemplateContentFormat,
-) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]:
+) -> tuple[
+    list[ConversationMessage],
+    Awaitable[Optional[MultiModalDataDict]],
+    Optional[MultiModalUUIDDict],
+]:
     conversation: list[ConversationMessage] = []
     mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
 
@@ -1252,14 +1424,14 @@ def parse_chat_messages_futures(
                 content_format == "string"
                 and model_config.multimodal_config is not None
                 and model_config.multimodal_config.interleave_mm_strings
-            )
+            ),
         )
 
         conversation.extend(sub_messages)
 
     _postprocess_messages(conversation)
 
-    return conversation, mm_tracker.all_mm_data()
+    return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids()
 
 
 def apply_hf_chat_template(
@@ -1283,10 +1455,10 @@ def apply_hf_chat_template(
         raise ValueError(
             "As of transformers v4.44, default chat template is no longer "
             "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one.")
+            "does not define one."
+        )
 
     try:
-
         return tokenizer.apply_chat_template(
             conversation=conversation,  # type: ignore[arg-type]
             tools=tools,  # type: ignore[arg-type]
@@ -1298,13 +1470,14 @@ def apply_hf_chat_template(
     # External library exceptions can sometimes occur despite the framework's
     # internal exception management capabilities.
     except Exception as e:
-
         # Log and report any library-related exceptions for further
         # investigation.
         logger.exception(
-            "An error occurred in `transformers` while applying chat template")
+            "An error occurred in `transformers` while applying chat template"
+        )
         raise ValueError(str(e)) from e
 
+
 def apply_mistral_chat_template(
     tokenizer: MistralTokenizer,
     messages: list[ChatCompletionMessageParam],
@@ -1337,26 +1510,26 @@ def apply_mistral_chat_template(
     # External library exceptions can sometimes occur despite the framework's
     # internal exception management capabilities.
     except Exception as e:
-
         # Log and report any library-related exceptions for further
         # investigation.
         logger.exception(
-            "An error occurred in `mistral_common` while applying chat "
-            "template")
+            "An error occurred in `mistral_common` while applying chat template"
+        )
         raise ValueError(str(e)) from e
 
+
 def get_history_tool_calls_cnt(conversation: list[ConversationMessage]):
     idx = 0
     for msg in conversation:
-        if msg['role'] == 'assistant':
-            tool_calls = msg.get('tool_calls')
-            idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
+        if msg["role"] == "assistant":
+            tool_calls = msg.get("tool_calls")
+            idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
     return idx
 
-def make_tool_call_id(id_type:str='random', func_name=None, idx=None):
 
-    if id_type=='kimi_k2':
-        return f'functions.{func_name}:{idx}'
+def make_tool_call_id(id_type: str = "random", func_name=None, idx=None):
+    if id_type == "kimi_k2":
+        return f"functions.{func_name}:{idx}"
     else:
         # by default return random
         return f"chatcmpl-tool-{random_uuid()}"
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 9d587e8669339c1308876e16c9f90b069da3572b..9012639457cadc1b3df147ca2d9e844a3b6b70c3 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import contextlib
 import json
 import logging
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
 from contextlib import AsyncExitStack
 from typing import TYPE_CHECKING, Optional, Union
 
@@ -21,6 +22,23 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 
+class TurnTokens:
+    """Tracks token counts for a single conversation turn."""
+
+    def __init__(self, input_tokens=0, output_tokens=0):
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+
+    def reset(self):
+        """Reset counters for a new turn."""
+        self.input_tokens = 0
+        self.output_tokens = 0
+
+    def copy(self):
+        """Create a copy of this turn's token counts."""
+        return TurnTokens(self.input_tokens, self.output_tokens)
+
+
 class ConversationContext(ABC):
 
     @abstractmethod
@@ -41,17 +59,32 @@ class ConversationContext(ABC):
 
     @abstractmethod
     async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack) -> None:
+                                 exit_stack: AsyncExitStack,
+                                 request_id: str) -> None:
         pass
 
+    @abstractmethod
+    async def cleanup_session(self) -> None:
+        raise NotImplementedError("Should not be called.")
+
 
 class SimpleContext(ConversationContext):
 
     def __init__(self):
         self.last_output = None
+        self.num_prompt_tokens = 0
+        self.num_output_tokens = 0
+        self.num_cached_tokens = 0
+        # todo num_reasoning_tokens is not implemented yet.
+        self.num_reasoning_tokens = 0
 
     def append_output(self, output) -> None:
         self.last_output = output
+        if not isinstance(output, RequestOutput):
+            raise ValueError("SimpleContext only supports RequestOutput.")
+        self.num_prompt_tokens = len(output.prompt_token_ids or [])
+        self.num_cached_tokens = output.num_cached_tokens or 0
+        self.num_output_tokens += len(output.outputs[0].token_ids or [])
 
     def need_builtin_tool_call(self) -> bool:
         return False
@@ -63,9 +96,13 @@ class SimpleContext(ConversationContext):
         raise NotImplementedError("Should not be called.")
 
     async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack) -> None:
+                                 exit_stack: AsyncExitStack,
+                                 request_id: str) -> None:
         pass
 
+    async def cleanup_session(self) -> None:
+        raise NotImplementedError("Should not be called.")
+
 
 class HarmonyContext(ConversationContext):
 
@@ -77,39 +114,130 @@ class HarmonyContext(ConversationContext):
         self._messages = messages
         self.available_tools = available_tools
         self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
+        self.called_tools: set[str] = set()
 
         self.parser = get_streamable_parser_for_assistant()
         self.num_init_messages = len(messages)
         self.num_prompt_tokens = 0
         self.num_output_tokens = 0
-        # TODO(woosuk): Implement the following fields.
         self.num_cached_tokens = 0
         self.num_reasoning_tokens = 0
+        self.num_tool_output_tokens = 0
 
-    def _update_num_prompt_tokens(self, output: RequestOutput):
-        if output.prompt_token_ids and len(output.prompt_token_ids) > 0:
-            # NOTE: with built-in tools, there might be multiple rounds in
-            # the conversation, with the full conversation being resent
-            # as new prompt each time. Hence the sum.
-            self.num_prompt_tokens += len(output.prompt_token_ids)
+        # Turn tracking - replaces multiple individual tracking variables
+        self.current_turn = TurnTokens()
+        self.previous_turn = TurnTokens()
+        self.is_first_turn = True
+        self.first_tok_of_message = True  # For streaming support
 
-    def _update_num_output_tokens(self, token_ids: Sequence[int]):
-        self.num_output_tokens += len(token_ids)
+    def _update_num_reasoning_tokens(self):
+        # Count all analysis and commentary channels as reasoning tokens
+        if self.parser.current_channel in {"analysis", "commentary"}:
+            self.num_reasoning_tokens += 1
 
     def append_output(self, output) -> None:
         if isinstance(output, RequestOutput):
-            self._update_num_prompt_tokens(output)
             output_token_ids = output.outputs[0].token_ids
-            self._update_num_output_tokens(output_token_ids)
             self.parser = get_streamable_parser_for_assistant()
             for token_id in output_token_ids:
                 self.parser.process(token_id)
+                # Check if the current token is part of reasoning content
+                self._update_num_reasoning_tokens()
+            self._update_prefill_token_usage(output)
+            # Reset current turn output tokens for this turn
+            self.current_turn.output_tokens = 0
+            self._update_decode_token_usage(output)
+            # Move current turn to previous turn for next turn's calculations
+            self.previous_turn = self.current_turn.copy()
             output_msgs = self.parser.messages
         else:
             # Tool output.
             output_msgs = output
         self._messages.extend(output_msgs)
 
+    def _update_prefill_token_usage(self, output: RequestOutput) -> None:
+        """Update token usage statistics for the prefill phase of generation.
+        
+        The prefill phase processes the input prompt tokens. This method:
+        1. Counts the prompt tokens for this turn
+        2. Calculates tool output tokens for multi-turn conversations
+        3. Updates cached token counts
+        4. Tracks state for next turn calculations
+        
+        Tool output tokens are calculated as:
+        current_prompt_tokens - last_turn_prompt_tokens - 
+        last_turn_output_tokens
+        This represents tokens added between turns (typically tool responses).
+        
+        Args:
+            output: The RequestOutput containing prompt token information
+        """
+        if output.prompt_token_ids is not None:
+            this_turn_input_tokens = len(output.prompt_token_ids)
+        else:
+            this_turn_input_tokens = 0
+            logger.error(
+                "RequestOutput appended contains no prompt_token_ids.")
+
+        # Update current turn input tokens
+        self.current_turn.input_tokens = this_turn_input_tokens
+        self.num_prompt_tokens += this_turn_input_tokens
+
+        # Calculate tool tokens (except on first turn)
+        if self.is_first_turn:
+            self.is_first_turn = False
+        else:
+            # start counting tool after first turn
+            # tool tokens = this turn prefill - last turn prefill -
+            # last turn decode
+            this_turn_tool_tokens = (self.current_turn.input_tokens -
+                                     self.previous_turn.input_tokens -
+                                     self.previous_turn.output_tokens)
+
+            # Handle negative tool token counts (shouldn't happen in normal
+            # cases)
+            if this_turn_tool_tokens < 0:
+                logger.error(
+                    "Negative tool output tokens calculated: %d "
+                    "(current_input=%d, previous_input=%d, "
+                    "previous_output=%d). Setting to 0.",
+                    this_turn_tool_tokens, self.current_turn.input_tokens,
+                    self.previous_turn.input_tokens,
+                    self.previous_turn.output_tokens)
+                this_turn_tool_tokens = 0
+
+            self.num_tool_output_tokens += this_turn_tool_tokens
+
+        # Update cached tokens
+        if output.num_cached_tokens is not None:
+            self.num_cached_tokens += output.num_cached_tokens
+
+    def _update_decode_token_usage(self, output: RequestOutput) -> int:
+        """Update token usage statistics for the decode phase of generation.
+        
+        The decode phase processes the generated output tokens. This method:
+        1. Counts output tokens from all completion outputs
+        2. Updates the total output token count
+        3. Tracks tokens generated in the current turn
+        
+        In streaming mode, this is called for each token generated.
+        In non-streaming mode, this is called once with all output tokens.
+        
+        Args:
+            output: The RequestOutput containing generated token information
+            
+        Returns:
+            int: Number of output tokens processed in this call
+        """
+        updated_output_token_count = 0
+        if output.outputs:
+            for completion_output in output.outputs:
+                # only keep last round
+                updated_output_token_count += len(completion_output.token_ids)
+            self.num_output_tokens += updated_output_token_count
+            self.current_turn.output_tokens += updated_output_token_count
+        return updated_output_token_count
+
     @property
     def messages(self) -> list:
         return self._messages
@@ -118,7 +246,8 @@ class HarmonyContext(ConversationContext):
         last_msg = self.messages[-1]
         recipient = last_msg.recipient
         return recipient is not None and (recipient.startswith("browser.")
-                                          or recipient.startswith("python"))
+                                          or recipient.startswith("python") or
+                                          recipient.startswith("container."))
 
     async def call_tool(self) -> list[Message]:
         if not self.messages:
@@ -132,6 +261,9 @@ class HarmonyContext(ConversationContext):
             elif recipient.startswith("python"):
                 return await self.call_python_tool(
                     self._tool_sessions["python"], last_msg)
+            elif recipient.startswith("container."):
+                return await self.call_container_tool(
+                    self._tool_sessions["container"], last_msg)
         raise ValueError("No tool call found")
 
     def render_for_completion(self) -> list[int]:
@@ -140,6 +272,7 @@ class HarmonyContext(ConversationContext):
     async def call_search_tool(self, tool_session: Union["ClientSession",
                                                          Tool],
                                last_msg: Message) -> list[Message]:
+        self.called_tools.add("browser")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
         tool_name = last_msg.recipient.split(".")[1]
@@ -149,12 +282,16 @@ class HarmonyContext(ConversationContext):
         content = TextContent(text=result_str)
         author = Author(role=Role.TOOL, name=last_msg.recipient)
         return [
-            Message(author=author, content=[content], recipient=Role.ASSISTANT)
+            Message(author=author,
+                    content=[content],
+                    recipient=Role.ASSISTANT,
+                    channel=last_msg.channel)
         ]
 
     async def call_python_tool(self, tool_session: Union["ClientSession",
                                                          Tool],
                                last_msg: Message) -> list[Message]:
+        self.called_tools.add("python")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
         param = {
@@ -174,13 +311,63 @@ class HarmonyContext(ConversationContext):
         ]
 
     async def init_tool_sessions(self, tool_server: Optional[ToolServer],
-                                 exit_stack: AsyncExitStack) -> None:
+                                 exit_stack: AsyncExitStack,
+                                 request_id: str) -> None:
         if tool_server:
             for tool_name in self.available_tools:
                 if tool_name not in self._tool_sessions:
-                    self._tool_sessions[
-                        tool_name] = await exit_stack.enter_async_context(
-                            tool_server.new_session(tool_name))
+                    tool_session = await exit_stack.enter_async_context(
+                        tool_server.new_session(tool_name, request_id))
+                    self._tool_sessions[tool_name] = tool_session
+                    exit_stack.push_async_exit(self.cleanup_session)
+
+    async def call_container_tool(self, tool_session: Union["ClientSession",
+                                                            Tool],
+                                  last_msg: Message) -> list[Message]:
+        """
+            Call container tool. Expect this to be run in a stateful docker
+            with command line terminal.
+            The official container tool would at least
+            expect the following format:
+            - for tool name: exec
+                - args:
+                    {
+                        "cmd":List[str] "command to execute",
+                        "workdir":optional[str] "current working directory",
+                        "env":optional[object/dict] "environment variables",
+                        "session_name":optional[str] "session name",
+                        "timeout":optional[int] "timeout in seconds",
+                        "user":optional[str] "user name",
+                    }
+        """
+        self.called_tools.add("container")
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1].split(" ")[0]
+        args = json.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [
+            Message(author=author,
+                    content=[content],
+                    recipient=Role.ASSISTANT,
+                    channel=last_msg.channel)
+        ]
+
+    async def cleanup_session(self, *args, **kwargs) -> None:
+        """Can be used as coro to used in __aexit__"""
+
+        async def cleanup_tool_session(tool_session):
+            if not isinstance(tool_session, Tool):
+                logger.info("Cleaning up tool session for %s",
+                            tool_session._client_info)
+                with contextlib.suppress(Exception):
+                    await tool_session.call_tool("cleanup_session", {})
+
+        await asyncio.gather(*(cleanup_tool_session(self._tool_sessions[tool])
+                               for tool in self.called_tools))
 
 
 class StreamingHarmonyContext(HarmonyContext):
@@ -203,15 +390,22 @@ class StreamingHarmonyContext(HarmonyContext):
             # append_output is called for each output token in streaming case,
             # so we only want to add the prompt tokens once for each message.
             if self.first_tok_of_message:
-                self._update_num_prompt_tokens(output)
+                self._update_prefill_token_usage(output)
+                self.current_turn.output_tokens = 0
             # Reset self.first_tok_of_message if needed:
             # if the current token is the last one of the current message
             # (finished=True), then the next token processed will mark the
             # beginning of a new message
             self.first_tok_of_message = output.finished
-            tok = output.outputs[0].token_ids[0]
-            self.parser.process(tok)
-            self._update_num_output_tokens(output.outputs[0].token_ids)
+            for tok in output.outputs[0].token_ids:
+                self.parser.process(tok)
+            self._update_decode_token_usage(output)
+
+            # For streaming, update previous turn when message is complete
+            if output.finished:
+                self.previous_turn = self.current_turn.copy()
+            # Check if the current token is part of reasoning content
+            self._update_num_reasoning_tokens()
             self.last_tok = tok
         else:
             # Handle the case of tool output in direct message format
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 078d316844257b508dc51df6db46c8d5f2d0f8d5..f7528ba81dce56dc05bf605e1b73b9ee795f6dfa 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
 import datetime
 import json
 from collections.abc import Iterable, Sequence
@@ -13,12 +16,15 @@ from openai.types.responses.response_function_web_search import (
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
 from openai.types.responses.tool import Tool
-from openai_harmony import (Author, Conversation, DeveloperContent,
-                            HarmonyEncodingName, Message, ReasoningEffort,
-                            Role, StreamableParser, SystemContent, TextContent,
-                            ToolDescription, load_harmony_encoding)
-
-from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
+from openai_harmony import (Author, ChannelConfig, Conversation,
+                            DeveloperContent, HarmonyEncodingName, Message,
+                            ReasoningEffort, Role, StreamableParser,
+                            SystemContent, TextContent, ToolDescription,
+                            load_harmony_encoding)
+
+from vllm import envs
+from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
+                                              ResponseInputOutputItem)
 from vllm.utils import random_uuid
 
 REASONING_EFFORT = {
@@ -29,6 +35,20 @@ REASONING_EFFORT = {
 
 _harmony_encoding = None
 
+# Builtin tools that should be included in the system message when
+# they are available and requested by the user.
+# Tool args are provided by MCP tool descriptions. Output
+# of the tools are stringified.
+BUILTIN_TOOLS = {
+    "web_search_preview",
+    "code_interpreter",
+    "container",
+}
+
+
+def has_custom_tools(tool_types: list[str]) -> bool:
+    return not set(tool_types).issubset(BUILTIN_TOOLS)
+
 
 def get_encoding():
     global _harmony_encoding
@@ -44,10 +64,19 @@ def get_system_message(
     start_date: Optional[str] = None,
     browser_description: Optional[str] = None,
     python_description: Optional[str] = None,
+    container_description: Optional[str] = None,
+    instructions: Optional[str] = None,
+    with_custom_tools: bool = False,
 ) -> Message:
     sys_msg_content = SystemContent.new()
     if model_identity is not None:
         sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+    if (instructions is not None
+            and envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS):
+        current_identity = sys_msg_content.model_identity
+        new_identity = (f'{current_identity}\n{instructions}'
+                        if current_identity else instructions)
+        sys_msg_content = sys_msg_content.with_model_identity(new_identity)
     if reasoning_effort is not None:
         sys_msg_content = sys_msg_content.with_reasoning_effort(
             REASONING_EFFORT[reasoning_effort])
@@ -59,32 +88,55 @@ def get_system_message(
         sys_msg_content = sys_msg_content.with_tools(browser_description)
     if python_description is not None:
         sys_msg_content = sys_msg_content.with_tools(python_description)
+    if container_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(container_description)
+    if not with_custom_tools:
+        channel_config = sys_msg_content.channel_config
+        invalid_channel = "commentary"
+        new_config = ChannelConfig.require_channels(
+            [c for c in channel_config.valid_channels if c != invalid_channel])
+        sys_msg_content = sys_msg_content.with_channel_config(new_config)
     sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
     return sys_msg
 
 
-def get_developer_message(instructions: Optional[str] = None,
-                          tools: Optional[list[Tool]] = None) -> Message:
+def create_tool_definition(tool: Union[ChatCompletionToolsParam, Tool]):
+    if isinstance(tool, ChatCompletionToolsParam):
+        return ToolDescription.new(
+            name=tool.function.name,
+            description=tool.function.description,
+            parameters=tool.function.parameters,
+        )
+    return ToolDescription.new(
+        name=tool.name,
+        description=tool.description,
+        parameters=tool.parameters,
+    )
+
+
+def get_developer_message(
+    instructions: Optional[str] = None,
+    tools: Optional[list[Union[Tool, ChatCompletionToolsParam]]] = None,
+) -> Message:
     dev_msg_content = DeveloperContent.new()
-    if instructions is not None:
+    if (instructions is not None
+            and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS):
         dev_msg_content = dev_msg_content.with_instructions(instructions)
     if tools is not None:
-        function_tools = []
+        function_tools: list[Union[Tool, ChatCompletionToolsParam]] = []
         for tool in tools:
-            if tool.type in ("web_search_preview", "code_interpreter"):
+            if tool.type in ("web_search_preview", "code_interpreter",
+                             "container"):
                 # These are built-in tools that are added to the system message.
                 pass
+
             elif tool.type == "function":
                 function_tools.append(tool)
             else:
                 raise ValueError(f"tool type {tool.type} not supported")
         if function_tools:
             function_tool_descriptions = [
-                ToolDescription.new(
-                    name=tool.name,
-                    description=tool.description,
-                    parameters=tool.parameters,
-                ) for tool in function_tools
+                create_tool_definition(tool) for tool in function_tools
             ]
             dev_msg_content = dev_msg_content.with_function_tools(
                 function_tool_descriptions)
@@ -120,6 +172,8 @@ def parse_response_input(
                 TextContent(text=text_prefix + c["text"]) for c in content
             ]
             msg = Message.from_role_and_contents(role, contents)
+        if role == "assistant":
+            msg = msg.with_channel("final")
     elif response_msg["type"] == "function_call_output":
         call_id = response_msg["call_id"]
         call_response: Optional[ResponseFunctionToolCall] = None
@@ -148,16 +202,46 @@ def parse_response_input(
     return msg
 
 
-def parse_chat_input(chat_msg) -> Message:
-    role = chat_msg["role"]
-    content = chat_msg["content"]
+def parse_chat_input(chat_msg) -> list[Message]:
+    if not isinstance(chat_msg, dict):
+        # Handle Pydantic models
+        chat_msg = chat_msg.model_dump(exclude_none=True)
+
+    role = chat_msg.get("role")
+
+    # Assistant message with tool calls
+    tool_calls = chat_msg.get("tool_calls")
+    if role == "assistant" and tool_calls:
+        msgs: list[Message] = []
+        for call in tool_calls:
+            func = call.get("function", {})
+            name = func.get("name", "")
+            arguments = func.get("arguments", "") or ""
+            msg = Message.from_role_and_content(Role.ASSISTANT, arguments)
+            msg = msg.with_channel("commentary")
+            msg = msg.with_recipient(f"functions.{name}")
+            msg = msg.with_content_type("json")
+            msgs.append(msg)
+        return msgs
+
+    # Tool role message (tool output)
+    if role == "tool":
+        name = chat_msg.get("name", "")
+        content = chat_msg.get("content", "") or ""
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{name}"),
+            content).with_channel("commentary")
+        return [msg]
+
+    # Default: user/assistant/system messages with content
+    content = chat_msg.get("content", "")
     if isinstance(content, str):
         contents = [TextContent(text=content)]
     else:
         # TODO: Support refusal.
         contents = [TextContent(text=c.get("text", "")) for c in content]
     msg = Message.from_role_and_contents(role, contents)
-    return msg
+    return [msg]
 
 
 def render_for_completion(messages: list[Message]) -> list[int]:
@@ -227,7 +311,7 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
                     call_id=f"call_{random_id}",
                     type="function_call",
                     name=function_name,
-                    id=f"ft_{random_id}",
+                    id=f"fc_{random_id}",
                 )
                 output_items.append(response_item)
         elif recipient is not None and (recipient.startswith("python")
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index 4e852ba59493029635a49aef946c695a83f49413..887e2771092406d078aa2bbf21be59ce9f928f08 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -95,7 +95,7 @@ async def serve_http(app: FastAPI,
         port = uvicorn_kwargs["port"]
         process = find_process_using_port(port)
         if process is not None:
-            logger.debug(
+            logger.warning(
                 "port %s is used by process %s launched with command:\n%s",
                 port, process, " ".join(process.cmdline()))
         logger.info("Shutting down FastAPI HTTP server.")
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9b2ad808eb03e821b1fc93f9dcf9c516d6ae930f..a6174161f115a95f5b2fa566aba67c0bf00023c1 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -110,6 +110,14 @@ class LLM:
             values will increase the KV cache size and thus improve the model's
             throughput. However, if the value is too high, it may cause out-of-
             memory (OOM) errors.
+        kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
+            this is set to None and vllm can automatically infer the kv cache
+            size based on gpu_memory_utilization. However, users may want to
+            manually specify the kv cache memory size. kv_cache_memory_bytes
+            allows more fine-grain control of how much memory gets used when
+            compared with using gpu_memory_memory_utilization. Note that
+            kv_cache_memory_bytes (when not-None) ignores
+            gpu_memory_utilization
         swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
             This can be used for temporarily storing the states of the requests
             when their `best_of` sampling parameters are larger than 1. If all
@@ -184,6 +192,7 @@ class LLM:
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         override_pooler_config: Optional[PoolerConfig] = None,
+        kv_cache_memory_bytes: Optional[int] = None,
         compilation_config: Optional[Union[int, dict[str, Any],
                                            CompilationConfig]] = None,
         logits_processors: Optional[list[Union[str,
@@ -204,7 +213,7 @@ class LLM:
 
         if "kv_transfer_config" in kwargs and isinstance(
                 kwargs["kv_transfer_config"], dict):
-            from vllm.config import KVTransferConfig
+            from vllm.config.kv_transfer import KVTransferConfig
             raw_config_dict = kwargs["kv_transfer_config"]
             try:
                 kwargs["kv_transfer_config"] = KVTransferConfig(
@@ -251,6 +260,7 @@ class LLM:
             tokenizer_revision=tokenizer_revision,
             seed=seed,
             gpu_memory_utilization=gpu_memory_utilization,
+            kv_cache_memory_bytes=kv_cache_memory_bytes,
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
@@ -796,7 +806,7 @@ class LLM:
             # NOTE: _parse_chat_message_content_parts() currently doesn't
             # handle mm_processor_kwargs, since there is no implementation in
             # the chat message parsing for it.
-            conversation, mm_data = parse_chat_messages(
+            conversation, mm_data, mm_uuids = parse_chat_messages(
                 msgs,
                 model_config,
                 tokenizer,
@@ -826,6 +836,9 @@ class LLM:
             if mm_data is not None:
                 prompt["multi_modal_data"] = mm_data
 
+            if mm_uuids is not None:
+                prompt["multi_modal_uuids"] = mm_uuids
+
             if mm_processor_kwargs is not None:
                 prompt["mm_processor_kwargs"] = mm_processor_kwargs
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 3cebfdf885becd553d3ed462f173cfc1c6db85b7..c159bcee315f266830f3b0e3715d544984760878 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -616,14 +616,23 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
 
 
 @router.get("/v1/responses/{response_id}")
-async def retrieve_responses(response_id: str, raw_request: Request):
+async def retrieve_responses(
+    response_id: str,
+    raw_request: Request,
+    starting_after: Optional[int] = None,
+    stream: Optional[bool] = False,
+):
     handler = responses(raw_request)
     if handler is None:
         return base(raw_request).create_error_response(
             message="The model does not support Responses API")
 
     try:
-        response = await handler.retrieve_responses(response_id)
+        response = await handler.retrieve_responses(
+            response_id,
+            starting_after=starting_after,
+            stream=stream,
+        )
     except Exception as e:
         raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                             detail=str(e)) from e
@@ -631,6 +640,9 @@ async def retrieve_responses(response_id: str, raw_request: Request):
     if isinstance(response, ErrorResponse):
         return JSONResponse(content=response.model_dump(),
                             status_code=response.error.code)
+    elif stream:
+        return StreamingResponse(content=response,
+                                 media_type="text/event-stream")
     return JSONResponse(content=response.model_dump())
 
 
@@ -1705,6 +1717,8 @@ async def init_app_state(
 
     if args.tool_server == "demo":
         tool_server: Optional[ToolServer] = DemoToolServer()
+        assert isinstance(tool_server, DemoToolServer)
+        await tool_server.init_and_validate()
     elif args.tool_server:
         tool_server = MCPToolServer()
         await tool_server.add_tool_server(args.tool_server)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index d0b5d013eb9e53d668e060f52a5df42c5978525c..a6db97e55d7040d00741a1fbdf15ab79d35bd6f2 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -134,14 +134,13 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
     """If specified, will run the OpenAI frontend server in the same process as
     the model serving engine."""
     enable_request_id_headers: bool = False
-    """If specified, API server will add X-Request-Id header to responses.
-    Caution: this hurts performance at high QPS."""
+    """If specified, API server will add X-Request-Id header to responses."""
     enable_auto_tool_choice: bool = False
-    """If specified, exclude tool definitions in prompts when
-    tool_choice='none'."""
-    exclude_tools_when_tool_choice_none: bool = False
     """Enable auto tool choice for supported models. Use `--tool-call-parser`
     to specify which parser to use."""
+    exclude_tools_when_tool_choice_none: bool = False
+    """If specified, exclude tool definitions in prompts when
+    tool_choice='none'."""
     tool_call_parser: Optional[str] = None
     """Select the tool call parser depending on the model that you're using.
     This is used to parse the model-generated tool call into OpenAI API format.
@@ -204,7 +203,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
         frontend_kwargs["lora_modules"]["type"] = optional_type(str)
         frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
 
-        # Special case: Middleware needs append action
+        # Special case: Middleware needs to append action
         frontend_kwargs["middleware"]["action"] = "append"
         frontend_kwargs["middleware"]["type"] = str
         if "nargs" in frontend_kwargs["middleware"]:
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 12b274e1211bcba2d20183c7aef69756952125d8..c8ecbd28e7db5eea21f28315f834c7a941dd6dc3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -43,10 +43,10 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
 from vllm.entrypoints.score_utils import (ScoreContentPartParam,
                                           ScoreMultiModalParam)
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
                                   RequestOutputKind, SamplingParams)
-from vllm.sequence import Logprob
 from vllm.utils import random_uuid, resolve_obj_by_qualname
 
 logger = init_logger(__name__)
@@ -1270,9 +1270,20 @@ class CompletionRequest(OpenAIBaseModel):
     @model_validator(mode="before")
     @classmethod
     def validate_prompt_and_prompt_embeds(cls, data):
-        if data.get("prompt") is None and data.get("prompt_embeds") is None:
+        prompt = data.get("prompt")
+        prompt_embeds = data.get("prompt_embeds")
+
+        prompt_is_empty = (prompt is None
+                           or (isinstance(prompt, str) and prompt == ""))
+        embeds_is_empty = (prompt_embeds is None
+                           or (isinstance(prompt_embeds, list)
+                               and len(prompt_embeds) == 0))
+
+        if prompt_is_empty and embeds_is_empty:
             raise ValueError(
-                "At least one of `prompt` or `prompt_embeds` must be set.")
+                "Either prompt or prompt_embeds must be provided and non-empty."
+            )
+
         return data
 
     @model_validator(mode="before")
@@ -1342,6 +1353,14 @@ class EmbeddingChatRequest(OpenAIBaseModel):
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
 
     # --8<-- [start:chat-embedding-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by chat template in tokenizer config of the "
+         "model."),
+    )
+
     add_special_tokens: bool = Field(
         default=False,
         description=(
@@ -1424,9 +1443,10 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
     When using plugins IOProcessor plugins, the actual input is processed
     by the plugin itself. Hence, we use a generic type for the request data
     """
+    softmax: bool = True
 
     def to_pooling_params(self):
-        return PoolingParams(task="encode")
+        return PoolingParams(task="encode", softmax=self.softmax)
 
 
 class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
@@ -1832,7 +1852,8 @@ class InputTokensDetails(OpenAIBaseModel):
 
 
 class OutputTokensDetails(OpenAIBaseModel):
-    reasoning_tokens: int
+    reasoning_tokens: int = 0
+    tool_output_tokens: int = 0
 
 
 class ResponseUsage(OpenAIBaseModel):
@@ -2175,6 +2196,13 @@ class TranscriptionRequest(OpenAIBaseModel):
     )
     # --8<-- [end:transcription-extra-params]
 
+    to_language: Optional[str] = None
+    """The language of the output audio we transcribe to.
+
+    Please note that this is not currently used by supported models at this 
+    time, but it is a placeholder for future use, matching translation api.
+    """
+
     # --8<-- [start:transcription-sampling-params]
     temperature: float = Field(default=0.0)
     """The sampling temperature, between 0 and 1.
@@ -2408,6 +2436,9 @@ class TranslationRequest(OpenAIBaseModel):
 
     # TODO support additional sampling parameters
     # --8<-- [start:translation-sampling-params]
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
     temperature: float = Field(default=0.0)
     """The sampling temperature, between 0 and 1.
 
@@ -2427,6 +2458,14 @@ class TranslationRequest(OpenAIBaseModel):
     will improve accuracy.
     """
 
+    to_language: Optional[str] = None
+    """The language of the input audio we translate to.
+
+    Please note that this is not supported by all models, refer to the specific
+    model documentation for more details.
+    For instance, Whisper only supports `to_language=en`.
+    """
+
     stream: Optional[bool] = False
     """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
@@ -2458,6 +2497,7 @@ class TranslationRequest(OpenAIBaseModel):
 
         return SamplingParams.from_optional(temperature=temperature,
                                             max_tokens=max_tokens,
+                                            seed=self.seed,
                                             output_kind=RequestOutputKind.DELTA
                                             if self.stream \
                                             else RequestOutputKind.FINAL_ONLY)
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 01551a8c7f04adb67f3c7b48899a85b74beb030e..fa813550e520ca4b5681ea60314783efda8a8f10 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -161,7 +161,7 @@ async def write_local_file(output_path: str,
     batch_outputs: The list of batch outputs to write.
     """
     # We should make this async, but as long as run_batch runs as a
-    # standalone program, blocking the event loop won't effect performance.
+    # standalone program, blocking the event loop won't affect performance.
     with open(output_path, "w", encoding="utf-8") as f:
         for o in batch_outputs:
             print(o.model_dump_json(), file=f)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 6300d0758c3d4e66faf113643bfe45fef95469cc..579f6f537ee2d862d2ab771f37e55f1b8e6a9dcc 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -43,10 +43,10 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
                                                 truncate_tool_call_ids,
@@ -186,7 +186,7 @@ class OpenAIServingChat(OpenAIServing):
             lora_request = self._maybe_get_adapters(
                 request, supports_default_mm_loras=True)
 
-            model_name = self._get_model_name(request.model, lora_request)
+            model_name = self.models.model_name(lora_request)
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
@@ -489,6 +489,8 @@ class OpenAIServingChat(OpenAIServing):
                 get_streamable_parser_for_assistant()
                 for _ in range(num_choices)
             ]
+            harmony_tools_streamed = [False] * num_choices
+        tools_streamed = [False] * num_choices
 
         if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
             tool_choice_function_name = request.tool_choice.function.name
@@ -662,13 +664,11 @@ class OpenAIServingChat(OpenAIServing):
 
                     if self.use_harmony:
                         harmony_parser = harmony_parsers[i]
+                        prev_recipient = harmony_parser.current_recipient
                         for token_id in output.token_ids:
                             harmony_parser.process(token_id)
-                        is_reasoning = \
-                            harmony_parser.current_channel == "analysis"
-                        if not request.include_reasoning and is_reasoning:
-                            # Skip the reasoning content.
-                            continue
+                        cur_channel = harmony_parser.current_channel
+                        cur_recipient = harmony_parser.current_recipient
                         delta_text = harmony_parser.last_content_delta or ""
                     else:
                         delta_text = output.text
@@ -681,8 +681,7 @@ class OpenAIServingChat(OpenAIServing):
                     delta_message: Optional[DeltaMessage]
 
                     # just update previous_texts and previous_token_ids
-                    if ((tool_choice_auto or self.reasoning_parser)
-                            and not self.use_harmony):
+                    if tool_choice_auto or self.reasoning_parser:
                         assert previous_texts is not None
                         assert all_previous_token_ids is not None
                         previous_text = previous_texts[i]
@@ -696,11 +695,54 @@ class OpenAIServingChat(OpenAIServing):
                             current_token_ids = as_list(output.token_ids)
 
                     if self.use_harmony:
-                        if is_reasoning:
-                            delta_message = DeltaMessage(
-                                reasoning_content=delta_text)
-                        else:
+                        if cur_channel == "final":
                             delta_message = DeltaMessage(content=delta_text)
+                        elif cur_channel == "analysis":
+                            if request.include_reasoning:
+                                delta_message = DeltaMessage(
+                                    reasoning_content=delta_text)
+                            else:
+                                delta_message = None
+                        elif (cur_channel == "commentary" and cur_recipient
+                              and cur_recipient.startswith("functions.")):
+                            # Count completed tool calls to determine index
+                            base_index = 0
+                            for msg in harmony_parser.messages:
+                                if (msg.channel == "commentary"
+                                        and msg.recipient
+                                        and msg.recipient.startswith(
+                                            "functions.")):
+                                    base_index += 1
+
+                            if prev_recipient != cur_recipient:
+                                tool_name = cur_recipient.split(
+                                    "functions.", 1)[1]
+                                delta_message = DeltaMessage(tool_calls=[
+                                    DeltaToolCall(
+                                        id=make_tool_call_id(),
+                                        type="function",
+                                        function=DeltaFunctionCall(
+                                            name=tool_name,
+                                            arguments="",
+                                        ),
+                                        index=base_index,
+                                    )
+                                ])
+                            elif delta_text:
+                                delta_message = DeltaMessage(tool_calls=[
+                                    DeltaToolCall(
+                                        index=base_index,
+                                        function=DeltaFunctionCall(
+                                            arguments=delta_text),
+                                    )
+                                ])
+                            else:
+                                delta_message = None
+
+                            if delta_message is not None:
+                                harmony_tools_streamed[i] = True
+                        else:
+                            delta_message = None
                     # handle streaming deltas for tools with named tool_choice
                     elif tool_choice_function_name:
                         if (self.reasoning_parser and not reasoning_end_arr[i]
@@ -758,6 +800,7 @@ class OpenAIServingChat(OpenAIServing):
                             delta_message = DeltaMessage(tool_calls=[
                                 delta_tool_call,
                             ])
+                            tools_streamed[i] = True
 
                     elif request.tool_choice == "required":
                         assert previous_texts is not None
@@ -783,6 +826,7 @@ class OpenAIServingChat(OpenAIServing):
                         if (delta_message and delta_message.tool_calls and
                                 delta_message.tool_calls[0].id is not None):
                             history_tool_call_cnt += 1
+                            tools_streamed[i] = True
 
                         # update the previous values for the next iteration
                         previous_texts[i] = current_text
@@ -859,6 +903,8 @@ class OpenAIServingChat(OpenAIServing):
                                     current_token_ids=current_token_ids,
                                     delta_token_ids=delta_token_ids,
                                     request=request))
+                            if delta_message and delta_message.tool_calls:
+                                tools_streamed[i] = True
                     # when only tool calls
                     elif tool_choice_auto:
                         assert tool_parser is not None
@@ -871,6 +917,8 @@ class OpenAIServingChat(OpenAIServing):
                                 current_token_ids=current_token_ids,
                                 delta_token_ids=output.token_ids,
                                 request=request))
+                        if delta_message and delta_message.tool_calls:
+                            tools_streamed[i] = True
 
                     # when only reasoning
                     elif self.reasoning_parser:
@@ -907,7 +955,10 @@ class OpenAIServingChat(OpenAIServing):
                     # wasn't ready to send a token, then
                     #   get the next token without streaming a chunk
                     if delta_message is None:
-                        continue
+                        if output.finish_reason is None:
+                            continue
+                        else:
+                            delta_message = DeltaMessage()
 
                     # Log streaming delta if output logging is enabled
                     if self.enable_log_outputs and self.request_logger:
@@ -993,12 +1044,18 @@ class OpenAIServingChat(OpenAIServing):
                             ])
 
                         # Send the finish response for each request.n only once
+                        if auto_tools_called or tools_streamed[i] or (
+                                self.use_harmony
+                                and harmony_tools_streamed[i]):
+                            finish_reason_ = "tool_calls"
+                        else:
+                            finish_reason_ = output.finish_reason \
+                                if output.finish_reason else "stop"
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
                             logprobs=logprobs,
-                            finish_reason=output.finish_reason
-                            if not auto_tools_called else "tool_calls",
+                            finish_reason=finish_reason_,
                             stop_reason=output.stop_reason,
                             token_ids=(as_list(output.token_ids)
                                        if request.return_token_ids else None))
@@ -1117,6 +1174,7 @@ class OpenAIServingChat(OpenAIServing):
         for output in final_res.outputs:
             token_ids = output.token_ids
             out_logprobs = output.logprobs
+            tool_call_info = None
 
             if request.logprobs and request.top_logprobs is not None:
                 assert out_logprobs is not None, "Did not output logprobs"
@@ -1131,31 +1189,42 @@ class OpenAIServingChat(OpenAIServing):
                 logprobs = None
 
             if self.use_harmony:
-                reasoning_content, final_content, is_tool_call = (
-                    parse_chat_output(token_ids))
-                if not request.include_reasoning:
-                    reasoning_content = None
-
-                if is_tool_call:
-                    # TODO(woosuk): Implement tool call for gpt-oss.
-                    # For now, only Responses API supports tool call for
-                    # gpt-oss.
-                    raise NotImplementedError(
-                        "Tool call in Chat Completion API is not supported "
-                        "for gpt-oss yet. Please use Responses API instead.")
+                if self.tool_parser is not None:
+                    tool_parser = self.tool_parser(tokenizer)
+                    # NOTE: We use token_ids for openai tool parser
+                    tool_call_info = tool_parser.extract_tool_calls(
+                        "",
+                        request=request,
+                        token_ids=token_ids,  # type: ignore
+                    )
+                    reasoning_content, content = None, tool_call_info.content
+                    if request.include_reasoning:
+                        reasoning_content, content, _ = parse_chat_output(
+                            token_ids)
+                    message = ChatMessage(
+                        role=role,
+                        reasoning_content=reasoning_content,
+                        content=content,
+                        tool_calls=tool_call_info.tool_calls,
+                    )
                 else:
-                    # Normal message
+                    reasoning_content, content, _ = parse_chat_output(
+                        token_ids)
+                    if not request.include_reasoning:
+                        reasoning_content = None
                     message = ChatMessage(
                         role=role,
                         reasoning_content=reasoning_content,
-                        content=final_content,
+                        content=content,
                     )
 
                 choice_data = ChatCompletionResponseChoice(
                     index=output.index,
                     message=message,
                     logprobs=logprobs,
-                    finish_reason="tool_calls" if is_tool_call else
+                    finish_reason="tool_calls" if
+                    (tool_call_info is not None
+                     and tool_call_info.tools_called) else
                     output.finish_reason if output.finish_reason else "stop",
                     stop_reason=output.stop_reason,
                 )
@@ -1419,9 +1488,10 @@ class OpenAIServingChat(OpenAIServing):
             step_top_logprobs = top_logprobs[i]
             if step_top_logprobs is None or step_top_logprobs.get(
                     token_id) is None:
-                token = tokenizer.decode(token_id)
                 if should_return_as_token_id:
                     token = f"token_id:{token_id}"
+                else:
+                    token = tokenizer.decode(token_id)
 
                 logprobs_content.append(
                     ChatCompletionLogProbsContent(
@@ -1503,12 +1573,12 @@ class OpenAIServingChat(OpenAIServing):
         messages.append(sys_msg)
 
         # Add developer message.
-        dev_msg = get_developer_message()
+        dev_msg = get_developer_message(tools=request.tools)
         messages.append(dev_msg)
 
         # Add user message.
         for chat_msg in request.messages:
-            messages.append(parse_chat_input(chat_msg))
+            messages.extend(parse_chat_input(chat_msg))
 
         # Render prompt token ids.
         prompt_token_ids = render_for_completion(messages)
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index b4fdc36390319dc4eb0573fe7ccd0c711123e814..7e88424c169cef07eae949088eb99596f593825d 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -20,6 +20,7 @@ from vllm.entrypoints.openai.serving_engine import (ClassificationServeContext,
                                                     OpenAIServing,
                                                     ServeContext)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.renderer import RenderConfig
 from vllm.logger import init_logger
 from vllm.outputs import ClassificationOutput, PoolingRequestOutput
 from vllm.pooling_params import PoolingParams
@@ -54,14 +55,10 @@ class ClassificationMixin(OpenAIServing):
             ctx.tokenizer = await self.engine_client.get_tokenizer(
                 ctx.lora_request)
 
-            (
-                ctx.request_prompts,
-                ctx.engine_prompts,
-            ) = await self._preprocess_completion(
-                ctx.request,
-                ctx.tokenizer,
-                ctx.request.input,
-            )
+            renderer = self._get_renderer(ctx.tokenizer)
+            ctx.engine_prompts = await renderer.render_prompt(
+                prompt_or_prompts=ctx.request.input,
+                config=self._build_render_config(ctx.request))
 
             return None
 
@@ -117,6 +114,12 @@ class ClassificationMixin(OpenAIServing):
             usage=usage,
         )
 
+    def _build_render_config(self,
+                             request: ClassificationRequest) -> RenderConfig:
+        return RenderConfig(
+            max_length=self.max_model_len,
+            truncate_prompt_tokens=request.truncate_prompt_tokens)
+
 
 class ServingClassification(ClassificationMixin):
     request_id_prefix = "classify"
@@ -143,7 +146,7 @@ class ServingClassification(ClassificationMixin):
         request: ClassificationRequest,
         raw_request: Request,
     ) -> Union[ClassificationResponse, ErrorResponse]:
-        model_name = self._get_model_name(request.model)
+        model_name = self.models.model_name()
         request_id = (f"{self.request_id_prefix}-"
                       f"{self._base_request_id(raw_request)}")
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 11effba8f9eb37ee807198e7e8eaa805a44c7a2e..c2de449a96994f575fe001e5daa0b0868c948de6 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -26,21 +26,18 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
                                               PromptTokenUsageInfo,
                                               RequestResponseMetadata,
                                               UsageInfo)
-from vllm.entrypoints.openai.serving_engine import (
-    EmbedsPrompt as ServingEngineEmbedsPrompt)
 from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
-                                                    TextTokensPrompt,
-                                                    clamp_prompt_logprobs,
-                                                    is_text_tokens_prompt)
+                                                    clamp_prompt_logprobs)
 # yapf: enable
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
                               is_tokens_prompt)
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import as_list, merge_async_iterators
 
@@ -132,12 +129,12 @@ class OpenAIServingCompletion(OpenAIServing):
             else:
                 tokenizer = await self.engine_client.get_tokenizer(lora_request
                                                                    )
+            renderer = self._get_renderer(tokenizer)
 
-            request_prompts, engine_prompts = await self._preprocess_completion(
-                request,
-                tokenizer,
-                request.prompt,
-                add_special_tokens=request.add_special_tokens,
+            engine_prompts = await renderer.render_prompt_and_embeds(
+                prompt_or_prompts=request.prompt,
+                prompt_embeds=request.prompt_embeds,
+                config=self._build_render_config(request),
             )
         except ValueError as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -198,7 +195,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
                 self._log_inputs(
                     request_id_item,
-                    request_prompts[i],
+                    engine_prompt,
                     params=sampling_params,
                     lora_request=lora_request,
                 )
@@ -235,7 +232,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
         result_generator = merge_async_iterators(*generators)
 
-        model_name = self._get_model_name(request.model, lora_request)
+        model_name = self.models.model_name(lora_request)
         num_prompts = len(engine_prompts)
 
         # Similar to the OpenAI API, when n != best_of, we do not stream the
@@ -249,7 +246,7 @@ class OpenAIServingCompletion(OpenAIServing):
         if stream:
             return self.completion_stream_generator(
                 request,
-                request_prompts,
+                engine_prompts,
                 result_generator,
                 request_id,
                 created_time,
@@ -273,11 +270,9 @@ class OpenAIServingCompletion(OpenAIServing):
                 # We did not pass it into vLLM engine to avoid being redundant
                 # with the inputs token IDs
                 if final_res.prompt is None:
-                    request_prompt = request_prompts[i]
-                    if is_text_tokens_prompt(request_prompt):
-                        final_res.prompt = request_prompt["prompt"]
-                    else:
-                        final_res.prompt = None
+                    engine_prompt = engine_prompts[i]
+                    final_res.prompt = None if is_embeds_prompt(
+                        engine_prompt) else engine_prompt.get("prompt")
 
             final_res_batch_checked = cast(list[RequestOutput],
                                            final_res_batch)
@@ -313,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing):
     async def completion_stream_generator(
         self,
         request: CompletionRequest,
-        request_prompts: list[Union[TextTokensPrompt,
-                                    ServingEngineEmbedsPrompt]],
+        engine_prompts: list[Union[TokensPrompt, EmbedsPrompt]],
         result_generator: AsyncIterator[tuple[int, RequestOutput]],
         request_id: str,
         created_time: int,
@@ -350,14 +344,11 @@ class OpenAIServingCompletion(OpenAIServing):
                     num_cached_tokens = res.num_cached_tokens
                     first_iteration = False
 
-                if res.prompt is not None:
-                    prompt_text = res.prompt
-                else:
-                    request_prompt = request_prompts[prompt_idx]
-                    if is_text_tokens_prompt(request_prompt):
-                        prompt_text = request_prompt["prompt"]
-                    else:
-                        prompt_text = None
+                prompt_text = res.prompt
+                if prompt_text is None:
+                    engine_prompt = engine_prompts[prompt_idx]
+                    prompt_text = None if is_embeds_prompt(
+                        engine_prompt) else engine_prompt.get("prompt")
 
                 # Prompt details are excluded from later streamed outputs
                 if prompt_token_ids is not None:
@@ -378,6 +369,8 @@ class OpenAIServingCompletion(OpenAIServing):
                     assert request.max_tokens is not None
                     if request.echo and not has_echoed[i]:
                         assert prompt_token_ids is not None
+                        if request.return_token_ids:
+                            prompt_text = ""
                         assert prompt_text is not None
                         if request.max_tokens == 0:
                             # only return the prompt
@@ -525,6 +518,8 @@ class OpenAIServingCompletion(OpenAIServing):
             for output in final_res.outputs:
                 assert request.max_tokens is not None
                 if request.echo:
+                    if request.return_token_ids:
+                        prompt_text = ""
                     assert prompt_text is not None
                     if request.max_tokens == 0:
                         token_ids = prompt_token_ids
@@ -676,3 +671,18 @@ class OpenAIServingCompletion(OpenAIServing):
             tokens=out_tokens,
             top_logprobs=out_top_logprobs,
         )
+
+    def _build_render_config(
+        self,
+        request: CompletionRequest,
+        max_input_length: Optional[int] = None,
+    ) -> RenderConfig:
+        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
+        return RenderConfig(
+            max_length=max_input_tokens_len,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens,
+            cache_salt=request.cache_salt,
+            needs_detokenization=bool(request.echo
+                                      and not request.return_token_ids),
+        )
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 0a0d98db2d0d89c888a9d4fdb469cd0f5c0030e2..c0d1fe4b6e1683e3ac94a036247ccf9485573953 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -24,12 +24,11 @@ from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest,
                                               ErrorResponse, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext,
                                                     OpenAIServing,
-                                                    RequestPrompt,
                                                     ServeContext,
                                                     TextTokensPrompt)
 # yapf: enable
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
+from vllm.entrypoints.renderer import RenderConfig
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput,
@@ -79,11 +78,12 @@ class EmbeddingMixin(OpenAIServing):
 
             tokenizer = await self.engine_client.get_tokenizer(ctx.lora_request
                                                                )
+            renderer = self._get_renderer(tokenizer)
 
             if isinstance(ctx.request, EmbeddingChatRequest):
                 (
                     _,
-                    ctx.request_prompts,
+                    _,
                     ctx.engine_prompts,
                 ) = await self._preprocess_chat(
                     ctx.request,
@@ -93,25 +93,33 @@ class EmbeddingMixin(OpenAIServing):
                     or ctx.chat_template,
                     chat_template_content_format=ctx.
                     chat_template_content_format,
-                    # In embedding requests, we are not generating tokens,
-                    # so there is no need to append extra tokens to the input
-                    add_generation_prompt=False,
+                    add_generation_prompt=ctx.request.add_generation_prompt,
                     continue_final_message=False,
                     add_special_tokens=ctx.request.add_special_tokens,
                 )
             else:
-                (ctx.request_prompts,
-                 ctx.engine_prompts) = await self._preprocess_completion(
-                     ctx.request,
-                     tokenizer,
-                     ctx.request.input,
-                     add_special_tokens=ctx.request.add_special_tokens,
-                 )
+                ctx.engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=ctx.request.input,
+                    config=self._build_render_config(ctx.request),
+                )
             return None
         except (ValueError, TypeError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
+    def _build_render_config(
+            self, request: EmbeddingCompletionRequest) -> RenderConfig:
+        # Set max_length based on chunked processing capability
+        if self._should_use_chunked_processing(request):
+            max_length = None
+        else:
+            max_length = self.max_embed_len or self.max_model_len
+
+        return RenderConfig(
+            max_length=max_length,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens)
+
     @override
     def _build_response(
         self,
@@ -287,8 +295,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _create_single_prompt_generator(
         self,
         ctx: EmbeddingServeContext,
-        engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt],
-        request_prompt: RequestPrompt,
+        engine_prompt: EngineTokensPrompt,
         pooling_params: PoolingParams,
         trace_headers: Optional[Mapping[str, str]],
         prompt_index: int,
@@ -297,16 +304,10 @@ class EmbeddingMixin(OpenAIServing):
         request_id_item = f"{ctx.request_id}-{prompt_index}"
 
         self._log_inputs(request_id_item,
-                         request_prompt,
+                         engine_prompt,
                          params=pooling_params,
                          lora_request=ctx.lora_request)
 
-        # Mypy has an existing bug related to inferring the variance
-        # of TypedDicts with `builtins.enumerate`:
-        # https://github.com/python/mypy/issues/8586#issuecomment-2867698435
-        engine_prompt = cast(Union[EngineTokensPrompt, EngineEmbedsPrompt],
-                             engine_prompt)
-
         # Return the original generator without wrapping
         return self.engine_client.encode(
             engine_prompt,
@@ -355,20 +356,14 @@ class EmbeddingMixin(OpenAIServing):
                 return self.create_error_response(
                     "Engine prompts not available")
 
-            if ctx.request_prompts is None:
-                return self.create_error_response(
-                    "Request prompts not available")
-
             max_pos_embeddings = self._get_max_position_embeddings()
 
             for i, engine_prompt in enumerate(ctx.engine_prompts):
-                request_prompt = ctx.request_prompts[i]
-
                 # Check if this specific prompt needs chunked processing
-                if self._is_text_tokens_prompt(request_prompt):
+                if self._is_text_tokens_prompt(engine_prompt):
                     # Cast to TextTokensPrompt since we've verified
                     # prompt_token_ids
-                    text_tokens_prompt = cast(TextTokensPrompt, request_prompt)
+                    text_tokens_prompt = cast(TextTokensPrompt, engine_prompt)
                     if (len(text_tokens_prompt["prompt_token_ids"])
                             > max_pos_embeddings):
                         # Use chunked processing for this prompt
@@ -379,13 +374,8 @@ class EmbeddingMixin(OpenAIServing):
                         continue
 
                 # Normal processing for short prompts or non-token prompts
-                # Cast engine_prompt to the expected type for mypy
-                engine_prompt_typed = cast(
-                    Union[EngineTokensPrompt, EngineEmbedsPrompt],
-                    engine_prompt)
                 generator = await self._create_single_prompt_generator(
-                    ctx, engine_prompt_typed, request_prompt, pooling_params,
-                    trace_headers, i)
+                    ctx, engine_prompt, pooling_params, trace_headers, i)
                 generators.append(generator)
 
             from vllm.utils import merge_async_iterators
@@ -421,10 +411,6 @@ class EmbeddingMixin(OpenAIServing):
             if not use_chunked:
                 return await super()._collect_batch(ctx=ctx)
 
-            if ctx.request_prompts is None:
-                return self.create_error_response(
-                    "Request prompts not available")
-
             if ctx.result_generator is None:
                 return self.create_error_response(
                     "Result generator not available")
@@ -540,7 +526,7 @@ class EmbeddingMixin(OpenAIServing):
                             data=final_embedding)
 
                         # Get original prompt token IDs for this prompt
-                        original_prompt = ctx.request_prompts[prompt_idx]
+                        original_prompt = ctx.engine_prompts[prompt_idx]
                         if not self._is_text_tokens_prompt(original_prompt):
                             return self.create_error_response(
                                 f"Chunked prompt {prompt_idx} is not a "
@@ -613,7 +599,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
         See https://platform.openai.com/docs/api-reference/embeddings/create
         for the API specification. This API mimics the OpenAI Embedding API.
         """
-        model_name = self._get_model_name(request.model)
+        model_name = self.models.model_name()
         request_id = (
             f"{self.request_id_prefix}-"
             f"{self._base_request_id(raw_request, request.request_id)}")
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index b6a18760115a2ae22fea7ea8902406de6f93285a..d391cc50ad232dc7dbdf785a02a31a2c330c4d89 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import asyncio
-import io
 import json
 import sys
 import time
@@ -9,10 +7,8 @@ import traceback
 from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from http import HTTPStatus
-from typing import (Annotated, Any, Callable, ClassVar, Generic, Optional,
-                    TypeVar, Union, cast, overload)
+from typing import Any, Callable, ClassVar, Generic, Optional, TypeVar, Union
 
-import pybase64
 import torch
 from fastapi import Request
 from pydantic import BaseModel, ConfigDict, Field
@@ -62,18 +58,19 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               TranslationRequest)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.renderer import (BaseRenderer, CompletionRenderer,
+                                       RenderConfig)
 # yapf: enable
-from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
+from vllm.inputs.data import PromptType
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
-from vllm.inputs.parse import parse_and_batch_prompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob, PromptLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import (  # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin
     MultiModalDataDict, MultiModalUUIDDict)
 from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import BeamSearchParams, SamplingParams
-from vllm.sequence import Logprob, PromptLogprobs
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
                           log_tracing_disabled_warning)
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
@@ -82,16 +79,26 @@ from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
 
 logger = init_logger(__name__)
 
-CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest,
-                              EmbeddingCompletionRequest, RerankRequest,
-                              ClassificationRequest, ScoreRequest,
-                              TokenizeCompletionRequest]
+CompletionLikeRequest = Union[
+    CompletionRequest,
+    DetokenizeRequest,
+    EmbeddingCompletionRequest,
+    RerankRequest,
+    ClassificationRequest,
+    ScoreRequest,
+    TokenizeCompletionRequest,
+]
 
 ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest,
                         TokenizeChatRequest]
 SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest]
-AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, SpeechToTextRequest,
-                   ResponsesRequest, IOProcessorRequest]
+AnyRequest = Union[
+    CompletionLikeRequest,
+    ChatLikeRequest,
+    SpeechToTextRequest,
+    ResponsesRequest,
+    IOProcessorRequest,
+]
 
 AnyResponse = Union[
     CompletionResponse,
@@ -135,9 +142,9 @@ class RequestProcessingMixin(BaseModel):
     Mixin for request processing,
     handling prompt preparation and engine input.
     """
+
     request_prompts: Optional[Sequence[RequestPrompt]] = []
-    engine_prompts: Optional[Union[list[EngineTokensPrompt],
-                                   list[EngineEmbedsPrompt]]] = []
+    engine_prompts: Optional[list[EngineTokensPrompt]] = []
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -147,6 +154,7 @@ class ResponseGenerationMixin(BaseModel):
     Mixin for response generation,
     managing result generators and final batch results.
     """
+
     result_generator: Optional[AsyncGenerator[tuple[int, Union[
         RequestOutput, PoolingRequestOutput]], None]] = None
     final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field(
@@ -155,8 +163,12 @@ class ResponseGenerationMixin(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
-class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel,
-                   Generic[RequestT]):
+class ServeContext(
+        RequestProcessingMixin,
+        ResponseGenerationMixin,
+        BaseModel,
+        Generic[RequestT],
+):
     # Shared across all requests
     request: RequestT
     raw_request: Optional[Request] = None
@@ -227,6 +239,29 @@ class OpenAIServing:
                                          AsyncMicrobatchTokenizer] = {}
         self.log_error_stack = log_error_stack
 
+    def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
+        """
+        Get a Renderer instance with the provided tokenizer.
+        Uses shared async tokenizer pool for efficiency.
+        """
+        return CompletionRenderer(
+            model_config=self.model_config,
+            tokenizer=tokenizer,
+            async_tokenizer_pool=self._async_tokenizer_pool)
+
+    def _build_render_config(
+        self,
+        request: Any,
+    ) -> RenderConfig:
+        """
+        Build and return a `RenderConfig` for an endpoint.
+
+        Used by the renderer to control how prompts are prepared
+        (e.g., tokenization and length handling). Endpoints should
+        implement this with logic appropriate to their request type.
+        """
+        raise NotImplementedError
+
     def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer:
         """
         Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
@@ -298,8 +333,8 @@ class OpenAIServing:
         truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens",
                                          None)
 
-        if truncate_prompt_tokens is not None and \
-            truncate_prompt_tokens > self.max_model_len:
+        if (truncate_prompt_tokens is not None
+                and truncate_prompt_tokens > self.max_model_len):
             return self.create_error_response(
                 "truncate_prompt_tokens value is "
                 "greater than max_model_len."
@@ -340,21 +375,13 @@ class OpenAIServing:
             for i, engine_prompt in enumerate(ctx.engine_prompts):
                 request_id_item = f"{ctx.request_id}-{i}"
 
-                if ctx.request_prompts is None:
-                    return self.create_error_response(
-                        "Request prompts not available")
-
-                self._log_inputs(request_id_item,
-                                 ctx.request_prompts[i],
-                                 params=pooling_params,
-                                 lora_request=ctx.lora_request)
-
-                # Mypy has an existing bug related to inferring the variance of
-                # TypedDicts with `builtins.enumerate`:
-                # https://github.com/python/mypy/issues/8586#issuecomment-2867698435
-                engine_prompt = cast(
-                    Union[EngineTokensPrompt, EngineEmbedsPrompt],
-                    engine_prompt)
+                self._log_inputs(
+                    request_id_item,
+                    engine_prompt,
+                    params=pooling_params,
+                    lora_request=ctx.lora_request,
+                )
+
                 generator = self.engine_client.encode(
                     engine_prompt,
                     pooling_params,
@@ -410,10 +437,11 @@ class OpenAIServing:
             return self.create_error_response(str(e))
 
     def create_error_response(
-            self,
-            message: str,
-            err_type: str = "BadRequestError",
-            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse:
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+    ) -> ErrorResponse:
         if self.log_error_stack:
             exc_type, _, _ = sys.exc_info()
             if exc_type is not None:
@@ -424,10 +452,11 @@ class OpenAIServing:
             message=message, type=err_type, code=status_code.value))
 
     def create_streaming_error_response(
-            self,
-            message: str,
-            err_type: str = "BadRequestError",
-            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+    ) -> str:
         json_str = json.dumps(
             self.create_error_response(message=message,
                                        err_type=err_type,
@@ -438,25 +467,25 @@ class OpenAIServing:
         self,
         request: AnyRequest,
     ) -> Optional[ErrorResponse]:
-
         error_response = None
 
         if self._is_model_supported(request.model):
             return None
         if request.model in self.models.lora_requests:
             return None
-        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and (
-                load_result := await self.models.resolve_lora(request.model)):
+        if (envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and
+            (load_result := await self.models.resolve_lora(request.model))):
             if isinstance(load_result, LoRARequest):
                 return None
-            if isinstance(load_result, ErrorResponse) and \
-                load_result.error.code == HTTPStatus.BAD_REQUEST.value:
+            if (isinstance(load_result, ErrorResponse) and
+                    load_result.error.code == HTTPStatus.BAD_REQUEST.value):
                 error_response = load_result
 
         return error_response or self.create_error_response(
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
-            status_code=HTTPStatus.NOT_FOUND)
+            status_code=HTTPStatus.NOT_FOUND,
+        )
 
     def _get_active_default_mm_loras(
             self, request: AnyRequest) -> Optional[LoRARequest]:
@@ -487,7 +516,6 @@ class OpenAIServing:
         request: AnyRequest,
         supports_default_mm_loras: bool = False,
     ) -> Optional[LoRARequest]:
-
         if request.model in self.models.lora_requests:
             return self.models.lora_requests[request.model]
 
@@ -548,13 +576,15 @@ class OpenAIServing:
                 prompt,
                 add_special_tokens=add_special_tokens,
                 truncation=True,
-                max_length=self.max_model_len)
+                max_length=self.max_model_len,
+            )
         else:
             encoded = await async_tokenizer(
                 prompt,
                 add_special_tokens=add_special_tokens,
                 truncation=True,
-                max_length=truncate_prompt_tokens)
+                max_length=truncate_prompt_tokens,
+            )
 
         input_ids = encoded.input_ids
         input_text = prompt
@@ -595,16 +625,22 @@ class OpenAIServing:
 
         # Note: EmbeddingRequest, ClassificationRequest,
         # and ScoreRequest doesn't have max_tokens
-        if isinstance(request,
-                      (EmbeddingChatRequest, EmbeddingCompletionRequest,
-                       ScoreRequest, RerankRequest, ClassificationRequest)):
-
+        if isinstance(
+                request,
+            (
+                EmbeddingChatRequest,
+                EmbeddingCompletionRequest,
+                ScoreRequest,
+                RerankRequest,
+                ClassificationRequest,
+            ),
+        ):
             # Note: input length can be up to the entire model context length
             # since these requests don't generate tokens.
             if token_num > self.max_model_len:
                 operations: dict[type[AnyRequest], str] = {
                     ScoreRequest: "score",
-                    ClassificationRequest: "classification"
+                    ClassificationRequest: "classification",
                 }
                 operation = operations.get(type(request),
                                            "embedding generation")
@@ -618,8 +654,11 @@ class OpenAIServing:
 
         # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
         # and does not require model context length validation
-        if isinstance(request, (TokenizeCompletionRequest, TokenizeChatRequest,
-                                DetokenizeRequest)):
+        if isinstance(
+                request,
+            (TokenizeCompletionRequest, TokenizeChatRequest,
+             DetokenizeRequest),
+        ):
             return TextTokensPrompt(prompt=input_text,
                                     prompt_token_ids=input_ids)
 
@@ -639,8 +678,8 @@ class OpenAIServing:
                 f"{token_num} input tokens. Please reduce the length of "
                 "the input messages.")
 
-        if max_tokens is not None and \
-            token_num + max_tokens > self.max_model_len:
+        if (max_tokens is not None
+                and token_num + max_tokens > self.max_model_len):
             raise ValueError(
                 "'max_tokens' or 'max_completion_tokens' is too large: "
                 f"{max_tokens}. This model's maximum context length is "
@@ -698,156 +737,6 @@ class OpenAIServing:
                     tokenizer=tokenizer,
                 )
 
-    async def _tokenize_prompt_input_or_inputs_async(
-        self,
-        request: AnyRequest,
-        tokenizer: Optional[AnyTokenizer],
-        input_or_inputs: Optional[Union[str, list[str], list[int],
-                                        list[list[int]]]],
-        add_special_tokens: bool = True,
-    ) -> tuple[list[TextTokensPrompt], list[EmbedsPrompt]]:
-        """
-        Tokenize/detokenize depending on the input format.
-
-        According to `OpenAI API <https://platform.openai.com/docs/api-reference/embeddings/create>`_
-        , each input can be a string or array of tokens. Note that each request
-        can pass one or more inputs.
-        """
-        inputs_embeds = list[EmbedsPrompt]()
-        inputs_text = list[TextTokensPrompt]()
-
-        truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens",
-                                         None)
-
-        if (truncate_prompt_tokens or 0) < 0:
-            truncate_prompt_tokens = self.max_model_len
-
-        if (isinstance(request, CompletionRequest)
-                and request.prompt_embeds is not None):
-            inputs_embeds.extend(
-                self._load_prompt_embeds(request.prompt_embeds,
-                                         truncate_prompt_tokens))
-
-        # Empty prompts are okay as long as there are prompt embeddings
-        if input_or_inputs is None or (inputs_embeds
-                                       and input_or_inputs == ""):
-            return [], inputs_embeds
-
-        # Although our type checking is based on mypy,
-        # VSCode Pyright extension should still work properly
-        # "is False" is required for Pyright to perform type narrowing
-        # See: https://github.com/microsoft/pyright/issues/7672
-
-        # Parse and batch the input prompts
-        batch_inputs = parse_and_batch_prompt(input_or_inputs)
-
-        # Process each input in the batch concurrently
-        tasks = []
-        for prompt_input in batch_inputs:
-            if prompt_input["is_tokens"] is False:
-                assert tokenizer is not None, \
-                    "Tokenizer is required for text prompts"
-                task = self._normalize_prompt_text_to_input(
-                    request,
-                    prompt_input["content"],
-                    tokenizer=tokenizer,
-                    add_special_tokens=add_special_tokens)
-            else:
-                task = self._normalize_prompt_tokens_to_input(
-                    request, prompt_input["content"], tokenizer=tokenizer)
-            tasks.append(task)
-
-        # Wait for all tokenization tasks to complete
-        results = await asyncio.gather(*tasks)
-        inputs_text.extend(results)
-
-        return inputs_text, inputs_embeds
-
-    @overload
-    async def _preprocess_completion(
-        self,
-        request: Union[DetokenizeRequest, EmbeddingCompletionRequest,
-                       RerankRequest, ClassificationRequest, ScoreRequest,
-                       TokenizeCompletionRequest],
-        tokenizer: Optional[AnyTokenizer],
-        input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
-        add_special_tokens: bool = ...,
-    ) -> tuple[list[TextTokensPrompt], list[EngineTokensPrompt]]:
-        ...
-
-    @overload
-    async def _preprocess_completion(
-        self,
-        request: CompletionRequest,
-        tokenizer: Optional[AnyTokenizer],
-        input_or_inputs: Optional[Union[str, list[str], list[int],
-                                        list[list[int]]]],
-        add_special_tokens: bool = ...,
-    ) -> tuple[list[Union[TextTokensPrompt, EmbedsPrompt]], list[Union[
-            EngineTokensPrompt, EngineEmbedsPrompt]]]:
-        ...
-
-    async def _preprocess_completion(
-        self,
-        request: CompletionLikeRequest,
-        tokenizer: Optional[AnyTokenizer],
-        input_or_inputs: Optional[Union[str, list[str], list[int],
-                                        list[list[int]]]],
-        add_special_tokens: bool = True,
-    ) -> tuple[Union[list[TextTokensPrompt], list[Union[
-            TextTokensPrompt, EmbedsPrompt]]], Union[
-                list[EngineTokensPrompt], list[Union[EngineTokensPrompt,
-                                                     EngineEmbedsPrompt]]]]:
-        if not isinstance(request,
-                          CompletionRequest) and input_or_inputs is None:
-            raise ValueError(
-                "Prompt embeds with non-completion requests is not"
-                " currently supported.")
-
-        (request_prompts_text, request_prompts_embeds
-         ) = await self._tokenize_prompt_input_or_inputs_async(
-             request,
-             tokenizer,
-             input_or_inputs,
-             add_special_tokens=add_special_tokens,
-         )
-
-        engine_prompts_text = [
-            EngineTokensPrompt(
-                prompt_token_ids=request_prompt_text["prompt_token_ids"])
-            for request_prompt_text in request_prompts_text
-        ]
-        cache_salt = request.cache_salt if (
-            hasattr(request, "cache_salt")
-            and request.cache_salt is not None) else None
-        if cache_salt:
-            for prompt_text in engine_prompts_text:
-                prompt_text["cache_salt"] = cache_salt
-
-        # This check is equivalent to simply checking if
-        # `request_prompts_embeds` is empty, but it's difficult to propagate
-        # overloads to the private helper functions to enable this check.
-        # This overload is needed because only TextPrompts are allowed for
-        # non-completion requests and if we don't add the overload here,
-        # everywhere this function is used outside of serving_completion will
-        # need logic asserting that only text prompts are in the request.
-        if not isinstance(request,
-                          CompletionRequest) and input_or_inputs is not None:
-            return request_prompts_text, engine_prompts_text
-
-        engine_prompts_embeds = [
-            EngineEmbedsPrompt(
-                prompt_embeds=request_prompt_embeds["prompt_embeds"])
-            for request_prompt_embeds in request_prompts_embeds
-        ]
-        if cache_salt:
-            for prompt_embed in engine_prompts_embeds:
-                prompt_embed["cache_salt"] = cache_salt
-
-        request_prompts = request_prompts_embeds + request_prompts_text
-        engine_prompts = engine_prompts_embeds + engine_prompts_text
-        return request_prompts, engine_prompts
-
     async def _preprocess_chat(
         self,
         request: Union[ChatLikeRequest, ResponsesRequest],
@@ -862,8 +751,11 @@ class OpenAIServing:
         chat_template_kwargs: Optional[dict[str, Any]] = None,
         tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
         add_special_tokens: bool = False,
-    ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt],
-               list[EngineTokensPrompt]]:
+    ) -> tuple[
+            list[ConversationMessage],
+            Sequence[RequestPrompt],
+            list[EngineTokensPrompt],
+    ]:
         model_config = self.model_config
 
         resolved_content_format = resolve_chat_template_content_format(
@@ -873,7 +765,7 @@ class OpenAIServing:
             tokenizer,
             model_config=model_config,
         )
-        conversation, mm_data_future = parse_chat_messages_futures(
+        conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
             messages,
             model_config,
             tokenizer,
@@ -925,8 +817,8 @@ class OpenAIServing:
 
         if tokenizer is None:
             assert isinstance(request_prompt, str), (
-                "Prompt has to be a string", \
-                "when the tokenizer is not initialised"
+                "Prompt has to be a string",
+                "when the tokenizer is not initialised",
             )
             prompt_inputs = TextTokensPrompt(prompt=request_prompt,
                                              prompt_token_ids=[1])
@@ -943,12 +835,17 @@ class OpenAIServing:
                 "Prompt has to be either a string or a list of token ids")
             prompt_inputs = TextTokensPrompt(
                 prompt=tokenizer.decode(request_prompt),
-                prompt_token_ids=request_prompt)
+                prompt_token_ids=request_prompt,
+            )
 
         engine_prompt = EngineTokensPrompt(
             prompt_token_ids=prompt_inputs["prompt_token_ids"])
         if mm_data is not None:
             engine_prompt["multi_modal_data"] = mm_data
+
+        if mm_uuids is not None:
+            engine_prompt["multi_modal_uuids"] = mm_uuids
+
         if request.mm_processor_kwargs is not None:
             engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
 
@@ -1007,49 +904,15 @@ class OpenAIServing:
                 prompt_token_ids=prompt_token_ids)
             request_prompt = prompt_token_ids
             # Update the sampling params.
-            sampling_params.max_tokens = (self.max_model_len -
-                                          len(prompt_token_ids))
+            sampling_params.max_tokens = self.max_model_len - len(
+                prompt_token_ids)
             # OPTIMIZATION
             priority = orig_priority - 1
 
-    @staticmethod
-    def _load_prompt_embeds(
-        prompt_embeds: Optional[Union[bytes, list[bytes]]],
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> list[EmbedsPrompt]:
-
-        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
-            tensor = torch.load(io.BytesIO(
-                pybase64.b64decode(embed, validate=True)),
-                                weights_only=True,
-                                map_location=torch.device("cpu"))
-            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
-                torch.float32,
-                torch.bfloat16,
-                torch.float16,
-            )
-            tensor = tensor.to_dense()
-            if tensor.dim() > 2:
-                tensor = tensor.squeeze(0)
-                assert tensor.dim() == 2
-            if truncate_prompt_tokens is not None:
-                tensor = tensor[-truncate_prompt_tokens:]
-            return {"prompt_embeds": tensor}
-
-        if prompt_embeds:
-            if isinstance(prompt_embeds, list):
-                return [
-                    _load_and_validate_embed(embed) for embed in prompt_embeds
-                ]
-            else:
-                return [_load_and_validate_embed(prompt_embeds)]
-        else:
-            return []
-
     def _log_inputs(
         self,
         request_id: str,
-        inputs: RequestPrompt,
+        inputs: Union[RequestPrompt, PromptType],
         params: Optional[Union[SamplingParams, PoolingParams,
                                BeamSearchParams]],
         lora_request: Optional[LoRARequest],
@@ -1061,11 +924,9 @@ class OpenAIServing:
             prompt = inputs
         elif isinstance(inputs, list):
             prompt_token_ids = inputs
-        elif 'prompt_embeds' in inputs:
-            prompt_embeds = inputs.get("prompt_embeds")
         else:
-            prompt = inputs["prompt"]
-            prompt_token_ids = inputs["prompt_token_ids"]
+            prompt = getattr(inputs, 'prompt', None)
+            prompt_token_ids = getattr(inputs, 'prompt_token_ids', None)
 
         self.request_logger.log_inputs(
             request_id,
@@ -1101,10 +962,12 @@ class OpenAIServing:
         return raw_request.headers.get("X-Request-Id", default)
 
     @staticmethod
-    def _get_decoded_token(logprob: Logprob,
-                           token_id: int,
-                           tokenizer: AnyTokenizer,
-                           return_as_token_id: bool = False) -> str:
+    def _get_decoded_token(
+        logprob: Logprob,
+        token_id: int,
+        tokenizer: AnyTokenizer,
+        return_as_token_id: bool = False,
+    ) -> str:
         if return_as_token_id:
             return f"token_id:{token_id}"
 
@@ -1117,19 +980,10 @@ class OpenAIServing:
             return True
         return self.models.is_base_model(model_name)
 
-    def _get_model_name(self,
-                        model_name: Optional[str] = None,
-                        lora_request: Optional[LoRARequest] = None) -> str:
-        if lora_request:
-            return lora_request.lora_name
-        if not model_name:
-            return self.models.base_model_paths[0].name
-        return model_name
-
 
 def clamp_prompt_logprobs(
     prompt_logprobs: Union[PromptLogprobs,
-                           None]) -> Union[PromptLogprobs, None]:
+                           None], ) -> Union[PromptLogprobs, None]:
     if prompt_logprobs is None:
         return prompt_logprobs
 
@@ -1137,6 +991,6 @@ def clamp_prompt_logprobs(
         if logprob_dict is None:
             continue
         for logprob_values in logprob_dict.values():
-            if logprob_values.logprob == float('-inf'):
+            if logprob_values.logprob == float("-inf"):
                 logprob_values.logprob = -9999.0
     return prompt_logprobs
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 685c98c817c3dec4bc536661b611a651919992f3..cac1d1ba56839804ba2ae6a2d720e07ed9c30e44 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -4,7 +4,7 @@
 import asyncio
 import base64
 import time
-from collections.abc import AsyncGenerator, Sequence
+from collections.abc import AsyncGenerator
 from typing import Final, Literal, Optional, Union, cast
 
 import jinja2
@@ -26,8 +26,9 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                               PoolingRequest, PoolingResponse,
                                               PoolingResponseData, UsageInfo)
 # yapf: enable
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, RequestPrompt
+from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
@@ -90,7 +91,7 @@ class OpenAIServingPooling(OpenAIServing):
         if error_check_ret is not None:
             return error_check_ret
 
-        model_name = self._get_model_name(request.model)
+        model_name = self.models.model_name()
 
         request_id = f"pool-{self._base_request_id(raw_request)}"
         created_time = int(time.time())
@@ -104,6 +105,7 @@ class OpenAIServingPooling(OpenAIServing):
             else:
                 tokenizer = await self.engine_client.get_tokenizer(lora_request
                                                                    )
+            renderer = self._get_renderer(tokenizer)
 
             if getattr(request, "dimensions", None) is not None:
                 return self.create_error_response(
@@ -126,14 +128,11 @@ class OpenAIServingPooling(OpenAIServing):
 
                 engine_prompts = await self.io_processor.pre_process_async(
                     prompt=validated_prompt, request_id=request_id)
-                request_prompts: Sequence[RequestPrompt] = [
-                    ""
-                ] * len(engine_prompts)
 
             elif isinstance(request, PoolingChatRequest):
                 (
                     _,
-                    request_prompts,
+                    _,
                     engine_prompts,
                 ) = await self._preprocess_chat(
                     request,
@@ -149,13 +148,10 @@ class OpenAIServingPooling(OpenAIServing):
                     add_special_tokens=request.add_special_tokens,
                 )
             elif isinstance(request, PoolingCompletionRequest):
-                (request_prompts,
-                 engine_prompts) = await self._preprocess_completion(
-                     request,
-                     tokenizer,
-                     request.input,
-                     add_special_tokens=request.add_special_tokens,
-                 )
+                engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=request.input,
+                    config=self._build_render_config(request),
+                )
             else:
                 raise ValueError(
                     f"Unsupported request of type {type(request)}")
@@ -177,7 +173,7 @@ class OpenAIServingPooling(OpenAIServing):
                 request_id_item = f"{request_id}-{i}"
 
                 self._log_inputs(request_id_item,
-                                 request_prompts[i],
+                                 engine_prompt,
                                  params=pooling_params,
                                  lora_request=lora_request)
 
@@ -272,3 +268,10 @@ class OpenAIServingPooling(OpenAIServing):
             data=items,
             usage=usage,
         )
+
+    def _build_render_config(
+            self, request: PoolingCompletionRequest) -> RenderConfig:
+        return RenderConfig(
+            max_length=self.max_model_len,
+            truncate_prompt_tokens=request.truncate_prompt_tokens,
+            add_special_tokens=request.add_special_tokens)
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 899cb07b2b37d3fa5834bd418713d69129eabde9..401ba6c53331cb3fc115b6241026edf9ffc09dba 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -4,6 +4,8 @@
 import asyncio
 import json
 import time
+import uuid
+from collections import deque
 from collections.abc import AsyncGenerator, AsyncIterator, Sequence
 from contextlib import AsyncExitStack
 from copy import copy
@@ -24,7 +26,8 @@ from openai.types.responses import (ResponseCreatedEvent,
                                     ResponseOutputMessage, ResponseOutputText,
                                     ResponseReasoningItem,
                                     ResponseReasoningTextDeltaEvent,
-                                    ResponseReasoningTextDoneEvent)
+                                    ResponseReasoningTextDoneEvent,
+                                    response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
 # yapf: enable
@@ -41,12 +44,13 @@ from vllm.entrypoints.context import (ConversationContext, HarmonyContext,
                                       SimpleContext, StreamingHarmonyContext)
 from vllm.entrypoints.harmony_utils import (
     get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_system_message, get_user_message, parse_output_message,
-    parse_remaining_state, parse_response_input, render_for_completion)
+    get_system_message, get_user_message, has_custom_tools,
+    parse_output_message, parse_remaining_state, parse_response_input,
+    render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.entrypoints.openai.protocol import (ErrorResponse,
+from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
@@ -55,14 +59,14 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.tool_server import MCPToolServer, ToolServer
+from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob as SampleLogprob
+from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob as SampleLogprob
-from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
 
@@ -168,6 +172,11 @@ class OpenAIServingResponses(OpenAIServing):
         # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
+        # HACK(wuhang): This is a hack. We should use a better store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove events from the store.
+        self.event_store: dict[str, tuple[deque[str], asyncio.Event]] = {}
+
         self.background_tasks: dict[str, asyncio.Task] = {}
 
         self.tool_server = tool_server
@@ -228,7 +237,7 @@ class OpenAIServingResponses(OpenAIServing):
 
         try:
             lora_request = self._maybe_get_adapters(request)
-            model_name = self._get_model_name(request.model, lora_request)
+            model_name = self.models.model_name(lora_request)
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
             if self.use_harmony:
@@ -249,15 +258,6 @@ class OpenAIServingResponses(OpenAIServing):
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
-        if self.tool_server is not None and isinstance(
-                self.tool_server,
-                MCPToolServer) and request.stream and request.tools and any(
-                    tool.type in ["web_search_preview", "code_interpreter"]
-                    for tool in request.tools):
-            return self.create_error_response(
-                "MCP tool server is not supported in background mode and "
-                "streaming mode")
-
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[ConversationContext, None]] = []
 
@@ -267,6 +267,8 @@ class OpenAIServingResponses(OpenAIServing):
                 builtin_tool_list.append("browser")
             if self.tool_server.has_tool("python"):
                 builtin_tool_list.append("python")
+            if self.tool_server.has_tool("container"):
+                builtin_tool_list.append("container")
 
         if self.tool_server is not None:
             available_tools = builtin_tool_list
@@ -329,25 +331,44 @@ class OpenAIServingResponses(OpenAIServing):
                 self.response_store[response.id] = response
 
             # Run the request in the background.
-            task = asyncio.create_task(
-                self._run_background_request(
-                    request,
-                    sampling_params,
-                    result_generator,
-                    context,
-                    model_name,
-                    tokenizer,
-                    request_metadata,
-                    created_time,
-                ),
-                name=f"create_{response.id}",
-            )
+            if request.stream:
+                task = asyncio.create_task(
+                    self._run_background_request_stream(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                        created_time,
+                    ),
+                    name=f"create_{request.request_id}",
+                )
+            else:
+                task = asyncio.create_task(
+                    self._run_background_request(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                        created_time,
+                    ),
+                    name=f"create_{response.id}",
+                )
 
             # For cleanup.
             response_id = response.id
             self.background_tasks[response_id] = task
             task.add_done_callback(
                 lambda _: self.background_tasks.pop(response_id, None))
+
+            if request.stream:
+                return self.responses_background_stream_generator(
+                    request.request_id)
             return response
 
         if request.stream:
@@ -430,7 +451,8 @@ class OpenAIServingResponses(OpenAIServing):
 
         async with AsyncExitStack() as exit_stack:
             try:
-                await context.init_tool_sessions(self.tool_server, exit_stack)
+                await context.init_tool_sessions(self.tool_server, exit_stack,
+                                                 request.request_id)
                 async for _ in result_generator:
                     pass
             except asyncio.CancelledError:
@@ -442,11 +464,7 @@ class OpenAIServingResponses(OpenAIServing):
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
-            # TODO: these are all 0 for now!
-            num_prompt_tokens = context.num_prompt_tokens
-            num_generated_tokens = context.num_output_tokens
-            num_cached_tokens = context.num_cached_tokens
-            num_reasoning_tokens = context.num_reasoning_tokens
+            num_tool_output_tokens = context.num_tool_output_tokens
         else:
             assert isinstance(context, SimpleContext)
             final_res = context.last_output
@@ -459,10 +477,13 @@ class OpenAIServingResponses(OpenAIServing):
 
             # Calculate usage.
             assert final_res.prompt_token_ids is not None
-            num_prompt_tokens = len(final_res.prompt_token_ids)
-            num_generated_tokens = len(final_output.token_ids)
-            num_cached_tokens = final_res.num_cached_tokens
-            num_reasoning_tokens = 0
+            num_tool_output_tokens = 0
+
+        assert isinstance(context, (SimpleContext, HarmonyContext))
+        num_prompt_tokens = context.num_prompt_tokens
+        num_generated_tokens = context.num_output_tokens
+        num_cached_tokens = context.num_cached_tokens
+        num_reasoning_tokens = context.num_reasoning_tokens
 
         usage = ResponseUsage(
             input_tokens=num_prompt_tokens,
@@ -471,7 +492,8 @@ class OpenAIServingResponses(OpenAIServing):
             input_tokens_details=InputTokensDetails(
                 cached_tokens=num_cached_tokens),
             output_tokens_details=OutputTokensDetails(
-                reasoning_tokens=num_reasoning_tokens),
+                reasoning_tokens=num_reasoning_tokens,
+                tool_output_tokens=num_tool_output_tokens),
         )
         response = ResponsesResponse.from_request(
             request,
@@ -537,6 +559,28 @@ class OpenAIServingResponses(OpenAIServing):
                 ))
         return out
 
+    def _create_stream_response_logprobs(
+        self,
+        token_ids: Sequence[int],
+        logprobs: Optional[SampleLogprobs],
+        tokenizer: AnyTokenizer,
+        top_logprobs: Optional[int] = None
+    ) -> list[response_text_delta_event.Logprob]:
+        lgs = self._create_response_logprobs(token_ids=token_ids,
+                                             logprobs=logprobs,
+                                             tokenizer=tokenizer,
+                                             top_logprobs=top_logprobs)
+        return [
+            response_text_delta_event.Logprob(
+                token=lg.token,
+                logprob=lg.logprob,
+                top_logprobs=[
+                    response_text_delta_event.LogprobTopLogprob(
+                        token=tl.token, logprob=tl.logprob)
+                    for tl in lg.top_logprobs
+                ]) for lg in lgs
+        ]
+
     def _make_response_output_items(
         self,
         request: ResponsesRequest,
@@ -670,13 +714,21 @@ class OpenAIServingResponses(OpenAIServing):
             # New conversation.
             reasoning_effort = (request.reasoning.effort
                                 if request.reasoning else None)
+            # Temporary: OpenAI types doesn't have container tool
+            # so we used MCP to cover that, up for change
             tool_types = [tool.type for tool in request.tools]
+            if envs.VLLM_GPT_OSS_USE_CONTAINER_TOOL:
+                tool_types.append("container")
             enable_browser = ("web_search_preview" in tool_types
                               and self.tool_server is not None
                               and self.tool_server.has_tool("browser"))
             enable_code_interpreter = ("code_interpreter" in tool_types
                                        and self.tool_server is not None
                                        and self.tool_server.has_tool("python"))
+            enable_container = ("container" in tool_types
+                                and self.tool_server is not None
+                                and self.tool_server.has_tool("container"))
+            with_custom_tools = has_custom_tools(tool_types)
             sys_msg = get_system_message(
                 reasoning_effort=reasoning_effort,
                 browser_description=self.tool_server.get_tool_description(
@@ -685,11 +737,17 @@ class OpenAIServingResponses(OpenAIServing):
                 python_description=self.tool_server.get_tool_description(
                     "python") if enable_code_interpreter
                 and self.tool_server is not None else None,
+                container_description=self.tool_server.get_tool_description(
+                    "container")
+                if enable_container and self.tool_server is not None else None,
+                instructions=request.instructions,
+                with_custom_tools=with_custom_tools,
             )
             messages.append(sys_msg)
-            dev_msg = get_developer_message(request.instructions,
-                                            request.tools)
-            messages.append(dev_msg)
+            if with_custom_tools:
+                dev_msg = get_developer_message(
+                    instructions=request.instructions, tools=request.tools)
+                messages.append(dev_msg)
         else:
             # Continue the previous conversation.
             # FIXME(woosuk): Currently, request params like reasoning and
@@ -717,7 +775,7 @@ class OpenAIServingResponses(OpenAIServing):
                             prev_msgs.append(msg)
             messages.extend(prev_msgs)
         # Append the new input.
-        # Reponses API supports simple text inputs without chat format.
+        # Responses API supports simple text inputs without chat format.
         if isinstance(request.input, str):
             messages.append(get_user_message(request.input))
         else:
@@ -728,7 +786,7 @@ class OpenAIServingResponses(OpenAIServing):
             for response_msg in request.input:
                 messages.append(
                     parse_response_input(response_msg, prev_outputs))
-                # User passes in a a tool call request and its output. We need
+                # User passes in a tool call request and its output. We need
                 # to add the tool call request to prev_outputs so that the
                 # parse_response_input can find the tool call request when
                 # parsing the tool call output.
@@ -736,6 +794,40 @@ class OpenAIServingResponses(OpenAIServing):
                     prev_outputs.append(response_msg)
         return messages
 
+    async def _run_background_request_stream(
+        self,
+        request: ResponsesRequest,
+        *args,
+        **kwargs,
+    ):
+        event_deque: deque[str] = deque()
+        new_event_signal = asyncio.Event()
+        self.event_store[request.request_id] = (event_deque, new_event_signal)
+        response = None
+        try:
+            generator = self.responses_stream_generator(
+                request, *args, **kwargs)
+            async for event in generator:
+                event_deque.append(event)
+                new_event_signal.set()  # Signal new event available
+        except Exception as e:
+            logger.exception("Background request failed for %s",
+                             request.request_id)
+            response = self.create_error_response(str(e))
+        finally:
+            # Mark as finished with a special marker
+            event_deque.append("__STREAM_END__")
+            new_event_signal.set()
+
+        if response is not None and isinstance(response, ErrorResponse):
+            # If the request has failed, update the status to "failed".
+            response_id = request.request_id
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(response_id)
+                assert stored_response is not None
+                if stored_response.status not in ("completed", "cancelled"):
+                    stored_response.status = "failed"
+
     async def _run_background_request(
         self,
         request: ResponsesRequest,
@@ -759,9 +851,36 @@ class OpenAIServingResponses(OpenAIServing):
                 if stored_response.status not in ("completed", "cancelled"):
                     stored_response.status = "failed"
 
+    async def responses_background_stream_generator(
+        self,
+        response_id: str,
+        starting_after: Optional[int] = None,
+    ):
+        if response_id not in self.event_store:
+            raise ValueError(f"Unknown response_id: {response_id}")
+
+        event_deque, new_event_signal = self.event_store[response_id]
+        start_index = 0 if starting_after is None else starting_after + 1
+        current_index = start_index
+
+        while True:
+            new_event_signal.clear()
+
+            # Yield existing events from start_index
+            while current_index < len(event_deque):
+                event = event_deque[current_index]
+                if event == "__STREAM_END__":
+                    return
+                yield event
+                current_index += 1
+
+            await new_event_signal.wait()
+
     async def retrieve_responses(
         self,
         response_id: str,
+        starting_after: Optional[int],
+        stream: Optional[bool],
     ) -> Union[ErrorResponse, ResponsesResponse]:
         if not response_id.startswith("resp_"):
             return self._make_invalid_id_error(response_id)
@@ -771,6 +890,12 @@ class OpenAIServingResponses(OpenAIServing):
 
         if response is None:
             return self._make_not_found_error(response_id)
+
+        if stream:
+            return self.responses_background_stream_generator(
+                response_id,
+                starting_after,
+            )
         return response
 
     async def cancel_responses(
@@ -829,7 +954,7 @@ class OpenAIServingResponses(OpenAIServing):
             status_code=HTTPStatus.BAD_REQUEST,
         )
 
-    async def _process_streaming_events(
+    async def _process_simple_streaming_events(
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
@@ -839,47 +964,292 @@ class OpenAIServingResponses(OpenAIServing):
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
         created_time: int,
+        _send_event: Callable[[BaseModel], str],
     ) -> AsyncGenerator[str, None]:
-        sequence_number = 0
-
-        def _send_event(event: BaseModel):
-            nonlocal sequence_number
-            # Set sequence_number if the event has this attribute
-            if hasattr(event, 'sequence_number'):
-                event.sequence_number = sequence_number
-            sequence_number += 1
-            # Get event type from the event's type field if it exists
-            event_type = getattr(event, 'type', 'unknown')
-            return (f"event: {event_type}\n"
-                    f"data: {event.model_dump_json(indent=None)}\n\n")
+        current_content_index = 0
+        current_output_index = 0
+        current_item_id = ""
+        reasoning_parser = None
+        if self.reasoning_parser:
+            reasoning_parser = self.reasoning_parser(tokenizer)
+        previous_text = ""
+        previous_token_ids: list[int] = []
+        first_delta_sent = False
+        previous_delta_messages: list[DeltaMessage] = []
+        async for ctx in result_generator:
+            assert isinstance(ctx, SimpleContext)
+            if ctx.last_output is None:
+                continue
+            if ctx.last_output.outputs:
+                output = ctx.last_output.outputs[0]
+                if reasoning_parser:
+                    delta_message = \
+                        reasoning_parser.extract_reasoning_content_streaming(
+                        previous_text=previous_text,
+                        current_text=previous_text + output.text,
+                        delta_text=output.text,
+                        previous_token_ids=previous_token_ids,
+                        current_token_ids=previous_token_ids +
+                        output.token_ids,
+                        delta_token_ids=output.token_ids,
+                    )
+                else:
+                    delta_message = DeltaMessage(content=output.text, )
+                previous_text += output.text
+                previous_token_ids += output.token_ids
+                if not delta_message:
+                    continue
+                if not first_delta_sent:
+                    current_item_id = str(uuid.uuid4())
+                    if delta_message.reasoning_content:
+                        yield _send_event(
+                            openai_responses_types.
+                            ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.
+                                ResponseReasoningItem(
+                                    type="reasoning",
+                                    id=current_item_id,
+                                    summary=[],
+                                    status="in_progress",
+                                ),
+                            ))
+                    else:
+                        yield _send_event(
+                            openai_responses_types.
+                            ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.
+                                ResponseOutputMessage(
+                                    id=current_item_id,
+                                    type="message",
+                                    role="assistant",
+                                    content=[],
+                                    status="in_progress",
+                                ),
+                            ))
+                    yield _send_event(
+                        openai_responses_types.ResponseContentPartAddedEvent(
+                            type="response.content_part.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            content_index=current_content_index,
+                            part=openai_responses_types.ResponseOutputText(
+                                type="output_text",
+                                text="",
+                                annotations=[],
+                                logprobs=[],
+                            ),
+                        ))
+                    current_content_index += 1
+                    first_delta_sent = True
+                # todo(kebe7jun) tool call support
+
+                # check delta message and previous delta message are
+                # same as content or reasoning content
+                if (previous_delta_messages
+                        and previous_delta_messages[-1].reasoning_content
+                        is not None and delta_message.content is not None):
+                    # from reasoning to normal content, send done
+                    # event for reasoning
+                    reason_content = ''.join(
+                        pm.reasoning_content for pm in previous_delta_messages
+                        if pm.reasoning_content is not None)
+                    yield _send_event(
+                        ResponseReasoningTextDoneEvent(
+                            type="response.reasoning_text.done",
+                            item_id=current_item_id,
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            content_index=current_content_index,
+                            text=reason_content,
+                        ))
+                    current_content_index = 0
+                    reasoning_item = ResponseReasoningItem(
+                        type="reasoning",
+                        content=[
+                            ResponseReasoningTextContent(
+                                text=reason_content,
+                                type="reasoning_text",
+                            ),
+                        ],
+                        status="completed",
+                        id=current_item_id,
+                        summary=[],
+                    )
+                    yield _send_event(
+                        ResponseOutputItemDoneEvent(
+                            type="response.output_item.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=reasoning_item,
+                        ))
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseOutputMessage(
+                                id=current_item_id,
+                                type="message",
+                                role="assistant",
+                                content=[],
+                                status="in_progress",
+                            ),
+                        ))
+                    current_output_index += 1
+                    current_item_id = str(uuid.uuid4())
+                    yield _send_event(
+                        openai_responses_types.ResponseContentPartAddedEvent(
+                            type="response.content_part.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            content_index=current_content_index,
+                            part=openai_responses_types.ResponseOutputText(
+                                type="output_text",
+                                text="",
+                                annotations=[],
+                                logprobs=[],
+                            ),
+                        ))
+                    current_content_index += 1
+                    # reset previous delta messages
+                    previous_delta_messages = []
 
+                if delta_message.reasoning_content is not None:
+                    yield _send_event(
+                        ResponseReasoningTextDeltaEvent(
+                            type="response.reasoning_text.delta",
+                            sequence_number=-1,
+                            content_index=current_content_index,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            delta=delta_message.reasoning_content,
+                        ))
+                elif delta_message.content is not None:
+                    yield _send_event(
+                        openai_responses_types.ResponseTextDeltaEvent(
+                            type="response.output_text.delta",
+                            sequence_number=-1,
+                            content_index=current_content_index,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            delta=delta_message.content,
+                            logprobs=self._create_stream_response_logprobs(
+                                token_ids=output.token_ids,
+                                logprobs=output.logprobs,
+                                tokenizer=tokenizer,
+                                top_logprobs=request.top_logprobs,
+                            ) if request.is_include_output_logprobs() else [],
+                        ))
+                current_content_index += 1
+
+                previous_delta_messages.append(delta_message)
+        if previous_delta_messages:
+            if previous_delta_messages[-1].reasoning_content is not None:
+                reason_content = ''.join(pm.reasoning_content
+                                         for pm in previous_delta_messages
+                                         if pm.reasoning_content is not None)
+                yield _send_event(
+                    ResponseReasoningTextDoneEvent(
+                        type="response.reasoning_text.done",
+                        item_id=current_item_id,
+                        sequence_number=-1,
+                        output_index=current_output_index,
+                        content_index=current_content_index,
+                        text=reason_content,
+                    ))
+                current_content_index += 1
+                reasoning_item = ResponseReasoningItem(
+                    type="reasoning",
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=reason_content,
+                            type="reasoning_text",
+                        ),
+                    ],
+                    status="completed",
+                    id=current_item_id,
+                    summary=[],
+                )
+                yield _send_event(
+                    ResponseOutputItemDoneEvent(
+                        type="response.output_item.done",
+                        sequence_number=-1,
+                        output_index=current_output_index,
+                        item=reasoning_item,
+                    ))
+            elif previous_delta_messages[-1].content is not None:
+                final_content = ''.join(pm.content
+                                        for pm in previous_delta_messages
+                                        if pm.content is not None)
+                yield _send_event(
+                    openai_responses_types.ResponseTextDoneEvent(
+                        type="response.output_text.done",
+                        sequence_number=-1,
+                        output_index=current_output_index,
+                        content_index=current_content_index,
+                        text=final_content,
+                        logprobs=[],
+                        item_id=current_item_id,
+                    ))
+                current_content_index += 1
+                part = ResponseOutputText(
+                    text=final_content,
+                    type="output_text",
+                    annotations=[],
+                )
+                yield _send_event(
+                    openai_responses_types.ResponseContentPartDoneEvent(
+                        type="response.content_part.done",
+                        sequence_number=-1,
+                        item_id=current_item_id,
+                        output_index=current_output_index,
+                        content_index=current_content_index,
+                        part=part,
+                    ))
+                current_content_index += 1
+                item = ResponseOutputMessage(
+                    type="message",
+                    role="assistant",
+                    content=[
+                        part,
+                    ],
+                    status="completed",
+                    id=current_item_id,
+                    summary=[],
+                )
+                yield _send_event(
+                    ResponseOutputItemDoneEvent(
+                        type="response.output_item.done",
+                        sequence_number=-1,
+                        output_index=current_output_index,
+                        item=item,
+                    ))
+
+    async def _process_harmony_streaming_events(
+        self,
+        request: ResponsesRequest,
+        sampling_params: SamplingParams,
+        result_generator: AsyncIterator[Optional[ConversationContext]],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: AnyTokenizer,
+        request_metadata: RequestResponseMetadata,
+        created_time: int,
+        _send_event: Callable[[BaseModel], str],
+    ) -> AsyncGenerator[str, None]:
         current_content_index = 0  # FIXME: this number is never changed
         current_output_index = 0
         current_item_id = ""  # FIXME: this number is never changed
         sent_output_item_added = False
 
-        initial_response = ResponsesResponse.from_request(
-            request,
-            sampling_params,
-            model_name=model_name,
-            created_time=created_time,
-            output=[],
-            status="in_progress",
-            usage=None,
-        ).model_dump()
-        yield _send_event(
-            ResponseCreatedEvent(
-                type="response.created",
-                sequence_number=-1,
-                response=initial_response,
-            ))
-        yield _send_event(
-            ResponseInProgressEvent(
-                type="response.in_progress",
-                sequence_number=-1,
-                response=initial_response,
-            ))
-
         async for ctx in result_generator:
 
             assert isinstance(ctx, StreamingHarmonyContext)
@@ -1229,29 +1599,6 @@ class OpenAIServingResponses(OpenAIServing):
                             ),
                         ))
 
-        async def empty_async_generator():
-            # A hack to trick Python to think this is a generator but in fact
-            # it immediately returns.
-            if False:
-                yield
-
-        final_response = await self.responses_full_generator(
-            request,
-            sampling_params,
-            empty_async_generator(),
-            context,
-            model_name,
-            tokenizer,
-            request_metadata,
-            created_time=created_time,
-        )
-        yield _send_event(
-            openai_responses_types.ResponseCompletedEvent(
-                type="response.completed",
-                sequence_number=-1,
-                response=final_response.model_dump(),
-            ))
-
     async def responses_stream_generator(
         self,
         request: ResponsesRequest,
@@ -1266,16 +1613,78 @@ class OpenAIServingResponses(OpenAIServing):
         # TODO:
         # 1. Handle disconnect
 
-        if not isinstance(context, StreamingHarmonyContext):
-            raise NotImplementedError(
-                "Streaming is not supported for responses API without Harmony."
-            )
-
         created_time = created_time or int(time.time())
 
+        sequence_number = 0
+
+        def _send_event(event: BaseModel):
+            nonlocal sequence_number
+            # Set sequence_number if the event has this attribute
+            if hasattr(event, 'sequence_number'):
+                event.sequence_number = sequence_number
+            sequence_number += 1
+            # Get event type from the event's type field if it exists
+            event_type = getattr(event, 'type', 'unknown')
+            return (f"event: {event_type}\n"
+                    f"data: {event.model_dump_json(indent=None)}\n\n")
+
         async with AsyncExitStack() as exit_stack:
-            await context.init_tool_sessions(self.tool_server, exit_stack)
-            async for event_data in self._process_streaming_events(
-                    request, sampling_params, result_generator, context,
-                    model_name, tokenizer, request_metadata, created_time):
+            processer = None
+            if self.use_harmony:
+                await context.init_tool_sessions(self.tool_server, exit_stack,
+                                                 request.request_id)
+                processer = self._process_harmony_streaming_events
+            else:
+                processer = self._process_simple_streaming_events
+
+            initial_response = ResponsesResponse.from_request(
+                request,
+                sampling_params,
+                model_name=model_name,
+                created_time=created_time,
+                output=[],
+                status="in_progress",
+                usage=None,
+            ).model_dump()
+            yield _send_event(
+                ResponseCreatedEvent(
+                    type="response.created",
+                    sequence_number=-1,
+                    response=initial_response,
+                ))
+            yield _send_event(
+                ResponseInProgressEvent(
+                    type="response.in_progress",
+                    sequence_number=-1,
+                    response=initial_response,
+                ))
+
+            async for event_data in processer(request, sampling_params,
+                                              result_generator, context,
+                                              model_name, tokenizer,
+                                              request_metadata, created_time,
+                                              _send_event):
                 yield event_data
+
+            async def empty_async_generator():
+                # A hack to trick Python to think this is a generator but
+                # in fact it immediately returns.
+                if False:
+                    yield
+
+            final_response = await self.responses_full_generator(
+                request,
+                sampling_params,
+                empty_async_generator(),
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+                created_time=created_time,
+            )
+            yield _send_event(
+                openai_responses_types.ResponseCompletedEvent(
+                    type="response.completed",
+                    sequence_number=-1,
+                    response=final_response.model_dump(),
+                ))
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 847c014a11dc3bbc4d2e1c3a1d0449cc043cc324..24767ed66fc6ad19b17e94ea285127b7c00d3385 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -353,7 +353,7 @@ class ServingScores(OpenAIServing):
                 final_res_batch,
                 request_id,
                 created_time,
-                self._get_model_name(request.model),
+                self.models.model_name(),
             )
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
@@ -399,7 +399,7 @@ class ServingScores(OpenAIServing):
             return self.request_output_to_rerank_response(
                 final_res_batch,
                 request_id,
-                self._get_model_name(request.model),
+                self.models.model_name(),
                 documents,
                 top_n,
             )
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index 2f258255d5f16ee6b6bac51058f019ba10724448..1efd9678571c4e43e6f5531a3532021d1e10a4ab 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -22,6 +22,7 @@ from vllm.entrypoints.openai.protocol import (DetokenizeRequest,
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.renderer import RenderConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
@@ -65,13 +66,14 @@ class OpenAIServingTokenization(OpenAIServing):
             lora_request = self._maybe_get_adapters(request)
 
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            renderer = self._get_renderer(tokenizer)
 
             if isinstance(request, TokenizeChatRequest):
                 tool_dicts = (None if request.tools is None else
                               [tool.model_dump() for tool in request.tools])
                 (
                     _,
-                    request_prompts,
+                    _,
                     engine_prompts,
                 ) = await self._preprocess_chat(
                     request,
@@ -87,21 +89,18 @@ class OpenAIServingTokenization(OpenAIServing):
                     add_special_tokens=request.add_special_tokens,
                 )
             else:
-                (request_prompts,
-                 engine_prompts) = await self._preprocess_completion(
-                     request,
-                     tokenizer,
-                     request.prompt,
-                     add_special_tokens=request.add_special_tokens,
-                 )
+                engine_prompts = await renderer.render_prompt(
+                    prompt_or_prompts=request.prompt,
+                    config=self._build_render_config(request),
+                )
         except (ValueError, TypeError, jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(f"{e} {e.__cause__}")
 
         input_ids: list[int] = []
-        for i, engine_prompt in enumerate(engine_prompts):
+        for engine_prompt in engine_prompts:
             self._log_inputs(request_id,
-                             request_prompts[i],
+                             engine_prompt,
                              params=None,
                              lora_request=lora_request)
 
@@ -158,6 +157,9 @@ class OpenAIServingTokenization(OpenAIServing):
             return self.create_error_response(
                 f"Failed to get tokenizer info: {str(e)}")
 
+    def _build_render_config(self, request: TokenizeRequest) -> RenderConfig:
+        return RenderConfig(add_special_tokens=request.add_special_tokens)
+
 
 @dataclass
 class TokenizerInfo:
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 1cbd7dba393f6c93bc851e8686a37489927c3312..965bdac3ac5ad2685f6b2181a6db5a4682018d09 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -89,6 +89,9 @@ class OpenAISpeechToText(OpenAIServing):
     ) -> tuple[list[PromptType], float]:
         # Validate request
         language = self.model_cls.validate_language(request.language)
+        # Skip to_language validation to avoid extra logging for Whisper.
+        to_language = self.model_cls.validate_language(request.to_language) \
+            if request.to_language else None
 
         if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
             raise ValueError("Maximum file size exceeded.")
@@ -112,7 +115,9 @@ class OpenAISpeechToText(OpenAIServing):
                 model_config=self.model_config,
                 language=language,
                 task_type=self.task_type,
-                request_prompt=request.prompt)
+                request_prompt=request.prompt,
+                to_language=to_language,
+            )
             prompts.append(prompt)
         return prompts, duration
 
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 44aa1208a54c783eeb99b4aabb3ea6e4e27d0c4d..35096b0461361f8bf1f6e36e43605d4c36cb0808 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -16,6 +16,7 @@ from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
 from .llama_tool_parser import Llama3JsonToolParser
 from .minimax_tool_parser import MinimaxToolParser
 from .mistral_tool_parser import MistralToolParser
+from .openai_tool_parser import OpenAIToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
 from .pythonic_tool_parser import PythonicToolParser
 from .qwen3coder_tool_parser import Qwen3CoderToolParser
@@ -46,4 +47,5 @@ __all__ = [
     "Qwen3CoderToolParser",
     "SeedOssToolParser",
     "Step3ToolParser",
+    "OpenAIToolParser",
 ]
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 6ef8fadf59ac5856914355a6a5c3703a4135976e..37c360145b04a69287030a66bdc18fe06a2e0bef 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
             self, request: ChatCompletionRequest) -> ChatCompletionRequest:
         if request.tools and request.tool_choice != 'none':
             # do not skip special tokens because internlm use the special
-            # tokens to indicated the start and end of the tool calls
+            # tokens to indicate the start and end of the tool calls
             # information.
             request.skip_special_tokens = False
         return request
@@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
         if '<|action_start|>' not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
-        # if the tool call is sended, return a empty delta message
-        # to make sure the finish_reason will be send correctly.
+        # if the tool call is sent, return an empty delta message
+        # to make sure the finish_reason will be sent correctly.
         if self.current_tool_id > 0:
             return DeltaMessage(content='')
 
@@ -89,7 +89,7 @@ class Internlm2ToolParser(ToolParser):
         try:
             parsable_arr = action
 
-            # tool calls are generated in an object in inernlm2
+            # tool calls are generated in an object in internlm2
             # it's not support parallel tool calls
             try:
                 tool_call_arr: dict = partial_json_parser.loads(
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 6bf44a4345a9d94cee7fc101156298432f805496..9a9a19ce2188eaa93e01ad6496074a66068f80b2 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser):
                             index] += delta.function.arguments
 
         # HACK: serving_chat.py inspects the internal state of tool parsers
-        # when determining it's final streaming delta, automatically
+        # when determining its final streaming delta, automatically
         # adding autocompleted JSON.
         # These two lines avoid that nonsense while ensuring finish_reason
         # is set to tool_calls when at least one tool is called.
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index c0691f122904e03d821ce29c1c63b9ee056dd553..e6b300fd84e94cafcbadf352111e205b9acab5ec 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -143,7 +143,7 @@ class MistralToolParser(ToolParser):
             except json.JSONDecodeError:
                 # use a regex to find the part corresponding to the tool call.
                 # NOTE: This use case should not happen if the model is trained
-                # correctly. It's a easy possible fix so it's included, but
+                # correctly. It's an easy possible fix so it's included, but
                 # can be brittle for very complex / highly nested tool calls
                 raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
                 function_call_arr = json.loads(raw_tool_call)
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5d59514b94450e57e000cd7f7a192ef8e38336b
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
+
+from vllm.entrypoints.harmony_utils import parse_output_into_messages
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+
+if TYPE_CHECKING:
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+
+
+@ToolParserManager.register_module("openai")
+class OpenAIToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+        token_ids: Sequence[int] | None = None,
+    ) -> ExtractedToolCallInformation:
+        if token_ids is None:
+            raise NotImplementedError(
+                "OpenAIToolParser requires token IDs and does not support text-based extraction."  # noqa: E501
+            )
+
+        parser = parse_output_into_messages(token_ids)
+        tool_calls = []
+        final_content = None
+
+        if len(parser.messages) > 0:
+            for msg in parser.messages:
+                if msg.recipient and msg.recipient.startswith("functions."):
+                    tool_calls.append(
+                        ToolCall(
+                            type="function",
+                            function=FunctionCall(
+                                name=msg.recipient.split("functions.")[1],
+                                arguments=msg.content[0].text,
+                            ),
+                        ))
+                elif msg.channel == "final":
+                    final_content = msg.content[0].text
+
+        return ExtractedToolCallInformation(
+            tools_called=len(tool_calls) > 0,
+            tool_calls=tool_calls,
+            content=final_content,
+        )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        raise NotImplementedError(
+            "Not being used, manual parsing in serving_chat.py"  # noqa: E501
+        )
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 73329cdf701d6e589845997d47c76bd73496db13..992f141bef0f2b905ebf12655ef968c0a109dafb 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -165,7 +165,7 @@ class PythonicToolParser(ToolParser):
                             index] += delta.function.arguments
 
             # HACK: serving_chat.py inspects the internal state of tool parsers
-            # when determining it's final streaming delta, automatically
+            # when determining its final streaming delta, automatically
             # adding autocompleted JSON.
             # These two lines avoid that nonsense while ensuring finish_reason
             # is set to tool_calls when at least one tool is called.
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0798afbcf212070f4546e8090ca141aed741ed9
--- /dev/null
+++ b/vllm/entrypoints/renderer.py
@@ -0,0 +1,395 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import io
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Annotated, Optional, Union
+
+import pybase64
+import torch
+from pydantic import Field
+
+from vllm.config import ModelConfig
+from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
+from vllm.inputs.parse import parse_and_batch_prompt
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import AsyncMicrobatchTokenizer
+
+
+@dataclass(frozen=True)
+class RenderConfig:
+    """Configuration to control how prompts are prepared."""
+
+    max_length: Optional[int] = None
+    """Maximum allowable total input token length. If provided,
+    token inputs longer than this raise ``ValueError``."""
+
+    truncate_prompt_tokens: Optional[int] = None
+    """Number of tokens to keep. ``None`` means no truncation.
+    ``0`` yields an empty list (and skips embeds).
+    ``-1`` maps to ``model_config.max_model_len``."""
+
+    add_special_tokens: Optional[bool] = True
+    """Whether to add model-specific special tokens during tokenization."""
+
+    cache_salt: Optional[str] = None
+    """String to disambiguate prefix cache entries."""
+
+    needs_detokenization: Optional[bool] = False
+    """If True, detokenize IDs back to text for inclusion in outputs."""
+
+
+class BaseRenderer(ABC):
+    """
+    Base class for unified input processing and rendering.
+    
+    The Renderer serves as a unified input processor that consolidates
+    tokenization, chat template formatting, and multimodal input handling
+    into a single component.
+    It converts high-level API requests (OpenAI-style JSON) into token IDs and
+    multimodal features ready for engine consumption.
+    
+    Key responsibilities:
+    - Convert text prompts to token sequences with proper special tokens
+    - Apply chat templates and format conversations
+    - Handle multimodal inputs (images, audio, etc.) when applicable
+    - Manage prompt truncation and length validation
+    - Provide clean separation between API layer and engine core
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        tokenizer: Optional[AnyTokenizer] = None,
+    ):
+        super().__init__()
+        self.model_config = model_config
+        self.tokenizer = tokenizer
+
+    @abstractmethod
+    async def render_prompt(
+        self,
+        *,
+        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
+        config: "RenderConfig",
+    ) -> list[EngineTokensPrompt]:
+        """
+        Convert text or token inputs into engine-ready TokensPrompt objects.
+
+        This method accepts text or token inputs and produces a
+        list of [`TokensPrompt`][vllm.inputs.data.TokensPrompt] objects
+        for the engine.
+
+        Args:
+            prompt_or_prompts: One of:
+                - ``str``: Single text prompt.
+                - ``list[str]``: Batch of text prompts.
+                - ``list[int]``: Single pre-tokenized sequence.
+                - ``list[list[int]]``: Batch of pre-tokenized sequences.
+            config: Render configuration controlling how prompts are prepared
+                (e.g., tokenization and length handling). 
+
+        Returns:
+            list[EngineTokensPrompt]: Engine-ready token prompts.
+
+        Raises:
+            ValueError: If input formats are invalid or length limits exceeded.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def render_prompt_and_embeds(
+        self,
+        *,
+        prompt_or_prompts: Optional[Union[str, list[str], list[int],
+                                          list[list[int]]]] = None,
+        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
+        config: "RenderConfig",
+    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+        """
+        Convert text/token and/or base64-encoded embeddings inputs into
+        engine-ready prompt objects using a unified RenderConfig.
+
+        At least one of ``prompt_or_prompts`` or ``prompt_embeds`` must be
+        provided and non-empty. If both are omitted or empty (e.g., empty
+        string and empty list), a ``ValueError`` is raised.
+
+        Args:
+            prompt_or_prompts: Text or token inputs to include.
+            prompt_embeds: Base64-encoded bytes (or list thereof) containing a
+                torch-saved tensor to be used as prompt embeddings.
+            config: Render configuration controlling how prompts are prepared
+                (e.g., tokenization and length handling). 
+
+        Returns:
+            list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+                Engine-ready prompt objects.
+
+        Raises:
+            ValueError: If both ``prompt_or_prompts`` and ``prompt_embeds``
+                are omitted or empty (decoder prompt cannot be empty), or if
+                length limits are exceeded.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def load_prompt_embeds(
+        cls,
+        prompt_embeds: Union[bytes, list[bytes]],
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None,
+        cache_salt: Optional[str] = None,
+    ) -> list[EngineEmbedsPrompt]:
+        """Load and validate base64-encoded embeddings into prompt objects."""
+
+        def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
+            tensor = torch.load(
+                io.BytesIO(pybase64.b64decode(embed, validate=True)),
+                weights_only=True,
+                map_location=torch.device("cpu"),
+            )
+            assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
+                torch.float32,
+                torch.bfloat16,
+                torch.float16,
+            )
+            tensor = tensor.to_dense()
+            if tensor.dim() > 2:
+                tensor = tensor.squeeze(0)
+                assert tensor.dim() == 2
+            if truncate_prompt_tokens is not None:
+                tensor = tensor[-truncate_prompt_tokens:]
+            embeds_prompt = EngineEmbedsPrompt(prompt_embeds=tensor)
+            if cache_salt is not None:
+                embeds_prompt["cache_salt"] = cache_salt
+            return embeds_prompt
+
+        if isinstance(prompt_embeds, list):
+            return [_load_and_validate_embed(embed) for embed in prompt_embeds]
+
+        return [_load_and_validate_embed(prompt_embeds)]
+
+
+class CompletionRenderer(BaseRenderer):
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        tokenizer: Optional[AnyTokenizer] = None,
+        async_tokenizer_pool: Optional[dict[AnyTokenizer,
+                                            AsyncMicrobatchTokenizer]] = None,
+    ):
+        super().__init__(model_config, tokenizer)
+        self.async_tokenizer_pool = async_tokenizer_pool
+        self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None
+
+    async def render_prompt(
+        self,
+        *,
+        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
+        config: "RenderConfig",
+    ) -> list[EngineTokensPrompt]:
+        """Implementation of prompt rendering for completion-style requests.
+        
+        Uses async tokenizer pooling for improved performance. See base class
+        for detailed parameter documentation.
+        """
+        truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens(
+            config.truncate_prompt_tokens, config.max_length)
+        if truncate_prompt_tokens == 0:
+            return []
+
+        # Parse and batch the input prompts
+        batch_inputs = parse_and_batch_prompt(prompt_or_prompts)
+
+        tasks = []
+        for prompt_input in batch_inputs:
+            if prompt_input["is_tokens"] is True:
+                # Token input
+                # Note: detokenization is needed when echo is enabled,
+                # where the input token IDs are decoded back to text.
+                task = self._maybe_detokenize(prompt_input["content"],
+                                              config.max_length,
+                                              truncate_prompt_tokens,
+                                              config.cache_salt,
+                                              config.needs_detokenization)
+            else:
+                # Text input
+                task = self._tokenize(prompt_input["content"],
+                                      config.max_length,
+                                      truncate_prompt_tokens,
+                                      config.add_special_tokens,
+                                      config.cache_salt)
+            tasks.append(task)
+
+        # Wait for all text tokenization to finish
+        if tasks:
+            tokenized_text_prompts = await asyncio.gather(*tasks)
+            return tokenized_text_prompts
+
+        return []
+
+    async def render_prompt_and_embeds(
+        self,
+        *,
+        prompt_or_prompts: Optional[Union[str, list[str], list[int],
+                                          list[list[int]]]] = None,
+        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
+        config: "RenderConfig",
+    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+        """
+        Render text/token prompts and/or precomputed embedding prompts. At
+        least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
+        """
+        truncate_prompt_tokens = self._validate_and_normalize_truncate_tokens(
+            config.truncate_prompt_tokens, config.max_length)
+        if truncate_prompt_tokens == 0:
+            return []
+
+        rendered: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]] = []
+
+        if prompt_embeds is not None:
+            rendered.extend(
+                self.load_prompt_embeds(prompt_embeds, truncate_prompt_tokens,
+                                        config.cache_salt))
+        if prompt_or_prompts is None or prompt_or_prompts == "":
+            return rendered
+
+        token_prompts = await self.render_prompt(
+            prompt_or_prompts=prompt_or_prompts,
+            config=config,
+        )
+        rendered.extend(token_prompts)
+
+        return rendered
+
+    def _validate_and_normalize_truncate_tokens(
+        self,
+        truncate_prompt_tokens: Optional[int],
+        max_length: Optional[int],
+    ) -> Optional[int]:
+        """Validate and normalize truncate_prompt_tokens parameter."""
+        if truncate_prompt_tokens is None:
+            return None
+
+        if truncate_prompt_tokens == 0:
+            return 0
+
+        if truncate_prompt_tokens < 0:
+            truncate_prompt_tokens = self.model_config.max_model_len
+
+        if max_length is not None and truncate_prompt_tokens > max_length:
+            raise ValueError(
+                f"truncate_prompt_tokens ({truncate_prompt_tokens}) "
+                f"cannot be greater than max_length ({max_length}). "
+                f"Please select a smaller truncation size.")
+
+        return truncate_prompt_tokens
+
+    def _maybe_apply_truncation(
+            self, token_ids: list[int],
+            truncate_prompt_tokens: Optional[int]) -> list[int]:
+        """Apply truncation to token sequence."""
+        if truncate_prompt_tokens is None:
+            return token_ids
+        if truncate_prompt_tokens >= len(token_ids):
+            return token_ids
+
+        return token_ids[-truncate_prompt_tokens:]
+
+    async def _tokenize(
+        self,
+        text: str,
+        max_length: Optional[int],
+        truncate_prompt_tokens: Optional[int],
+        add_special_tokens: Optional[bool],
+        cache_salt: Optional[str],
+    ) -> EngineTokensPrompt:
+        """Tokenize text input asynchronously."""
+        async_tokenizer = self._get_async_tokenizer()
+
+        # Handle encoder-specific preprocessing
+        if (self.model_config.encoder_config is not None
+                and self.model_config.encoder_config.get(
+                    "do_lower_case", False)):
+            text = text.lower()
+
+        # Tokenize texts
+        if truncate_prompt_tokens is None:
+            encoded = await async_tokenizer(
+                text, add_special_tokens=add_special_tokens)
+        else:
+            encoded = await async_tokenizer(
+                text,
+                add_special_tokens=add_special_tokens,
+                truncation=True,
+                max_length=truncate_prompt_tokens)
+
+        return self._create_tokens_prompt(encoded.input_ids, max_length,
+                                          cache_salt, text)
+
+    async def _maybe_detokenize(
+        self,
+        token_ids: list[int],
+        max_length: Optional[int],
+        truncate_prompt_tokens: Optional[int],
+        cache_salt: Optional[str],
+        needs_detokenization: Optional[bool] = False,
+    ) -> EngineTokensPrompt:
+        """Optionally detokenize token IDs and build a tokens prompt."""
+        token_ids = self._maybe_apply_truncation(token_ids,
+                                                 truncate_prompt_tokens)
+
+        prompt = None
+        if needs_detokenization is True:
+            async_tokenizer = self._get_async_tokenizer()
+            prompt = await async_tokenizer.decode(token_ids)
+
+        return self._create_tokens_prompt(token_ids=token_ids,
+                                          max_length=max_length,
+                                          cache_salt=cache_salt,
+                                          prompt=prompt)
+
+    def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer:
+        """Get or create async tokenizer using shared pool."""
+        async_tokenizer = self.async_tokenizer
+        if async_tokenizer is not None:
+            return async_tokenizer
+
+        tokenizer = self.tokenizer
+        if self.tokenizer is None:
+            raise ValueError(
+                "No tokenizer available for text input processing")
+
+        if self.async_tokenizer_pool is None:
+            async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
+        else:
+            async_tokenizer = self.async_tokenizer_pool.get(tokenizer)
+            if async_tokenizer is None:
+                async_tokenizer = AsyncMicrobatchTokenizer(tokenizer)
+                self.async_tokenizer_pool[tokenizer] = async_tokenizer
+        self.async_tokenizer = async_tokenizer
+        return async_tokenizer
+
+    def _create_tokens_prompt(
+        self,
+        token_ids: list[int],
+        max_length: Optional[int] = None,
+        cache_salt: Optional[str] = None,
+        prompt: Optional[str] = None,
+    ) -> EngineTokensPrompt:
+        """Create validated EngineTokensPrompt."""
+        if max_length is not None and len(token_ids) > max_length:
+            raise ValueError(
+                f"This maximum context length is {max_length} tokens. "
+                f"However, your request has {len(token_ids)} input tokens. "
+                "Please reduce the length of the input messages.")
+
+        tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
+        if cache_salt is not None:
+            tokens_prompt["cache_salt"] = cache_salt
+        if prompt is not None:
+            tokens_prompt["prompt"] = prompt
+        return tokens_prompt
diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py
index 758789a5e059dfb254131e44916deb786751b46c..f5f4d7d3b5565bed913df609fbfb5563dc4a3a7e 100644
--- a/vllm/entrypoints/tool.py
+++ b/vllm/entrypoints/tool.py
@@ -4,6 +4,8 @@ import os
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any
 
+from openai_harmony import Author, Message, Role, TextContent
+
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -99,6 +101,28 @@ class HarmonyPythonTool(Tool):
             return
 
         self.python_tool = PythonTool()
+
+    async def validate(self):
+        if not self.enabled:
+            return
+        try:
+            message = Message(
+                author=Author(role=Role.ASSISTANT),
+                content=[TextContent(text="print('Hello, world!')")],
+                channel="analysis",
+                recipient="python",
+                content_type="code",
+            )
+            msgs = []
+            async for msg in self.python_tool.process(message):
+                msgs.append(msg)
+            assert msgs[0].content[0].text == "Hello, world!\n"
+        except Exception as e:
+            self.enabled = False
+            logger.warning_once(
+                "Code interpreter tool failed to initialize (%s), code "
+                "interpreter is disabled", e)
+            return
         logger.info_once("Code interpreter tool initialized")
 
     async def get_result(self, context: "ConversationContext") -> Any:
diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py
index 2f28595f27c6a9b6ac67efd00d9403d8d33149a4..056a571fb2fd1a1eac765c8ae51eb46d4c7ae81a 100644
--- a/vllm/entrypoints/tool_server.py
+++ b/vllm/entrypoints/tool_server.py
@@ -86,7 +86,8 @@ class ToolServer(ABC):
         pass
 
     @abstractmethod
-    def new_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]:
+    def new_session(self, tool_name: str,
+                    session_id: str) -> AbstractAsyncContextManager[Any]:
         """
         Create a session for the tool.
         """
@@ -124,7 +125,8 @@ class MCPToolServer(ToolServer):
                                         description=tool.description,
                                         parameters=tool.inputSchema)
                     for tool in list_tools_response.tools
-                ])
+                ],
+            )
             self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp
             if tool_from_mcp.name not in self.urls:
                 self.urls[tool_from_mcp.name] = url
@@ -142,14 +144,16 @@ class MCPToolServer(ToolServer):
         return self.harmony_tool_descriptions.get(tool_name)
 
     @asynccontextmanager
-    async def new_session(self, tool_name: str):
+    async def new_session(self, tool_name: str, session_id: str):
         from mcp import ClientSession
         from mcp.client.sse import sse_client
         url = self.urls.get(tool_name)
+        headers = {"x-session-id": session_id}
         if not url:
             raise KeyError(f"Tool '{tool_name}' is not supported")
-        async with sse_client(url=url) as streams, ClientSession(
-                *streams) as session:
+        async with sse_client(url=url,
+                              headers=headers) as streams, ClientSession(
+                                  *streams) as session:
             await session.initialize()
             yield session
 
@@ -158,10 +162,13 @@ class DemoToolServer(ToolServer):
 
     def __init__(self):
         self.tools: dict[str, Tool] = {}
+
+    async def init_and_validate(self):
         browser_tool = HarmonyBrowserTool()
+        python_tool = HarmonyPythonTool()
+        await python_tool.validate()
         if browser_tool.enabled:
             self.tools["browser"] = browser_tool
-        python_tool = HarmonyPythonTool()
         if python_tool.enabled:
             self.tools["python"] = python_tool
         logger.info("DemoToolServer initialized with tools: %s",
@@ -182,7 +189,7 @@ class DemoToolServer(ToolServer):
             raise ValueError(f"Unknown tool {tool_name}")
 
     @asynccontextmanager
-    async def new_session(self, tool_name: str):
+    async def new_session(self, tool_name: str, session_id: str):
         if tool_name not in self.tools:
             raise KeyError(f"Tool '{tool_name}' is not supported")
         yield self.tools[tool_name]
diff --git a/vllm/env_override.py b/vllm/env_override.py
index 9fbdd855382c026e7e33924404dcd94e8a999907..6661b4b134e18cd79ed15a547cd18532226b700d 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -13,24 +13,6 @@ logger = init_logger(__name__)
 # that interact with vllm workers.
 # they are executed whenever `import vllm` is called.
 
-if os.environ.get('NCCL_CUMEM_ENABLE', '0') != '0':
-    logger.warning(
-        "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
-        "This may increase memory overhead with cudagraph+allreduce: "
-        "https://github.com/NVIDIA/nccl/issues/1234",
-        os.environ['NCCL_CUMEM_ENABLE'])
-elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
-    # NCCL requires NCCL_CUMEM_ENABLE to work with
-    # multi-node NVLink, typically on GB200-NVL72 systems.
-    # The ultimate way to detect multi-node NVLink is to use
-    # NVML APIs, which are too expensive to call here.
-    # As an approximation, we check the existence of
-    # /dev/nvidia-caps-imex-channels, used by
-    # multi-node NVLink to communicate across nodes.
-    # This will still cost some GPU memory, but it is worthwhile
-    # because we can get very fast cross-node bandwidth with NVLink.
-    os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
 # see https://github.com/vllm-project/vllm/pull/15951
 # it avoids unintentional cuda initialization from torch.cuda.is_available()
 os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
diff --git a/vllm/envs.py b/vllm/envs.py
index 33b0e9dedb235270561539783a2f5d73a1f42dc5..bab7c23f3c1c446bd43efbf6c82f3f576a2fd315 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -37,6 +37,7 @@ if TYPE_CHECKING:
     VLLM_CONFIGURE_LOGGING: int = 1
     VLLM_LOGGING_LEVEL: str = "INFO"
     VLLM_LOGGING_PREFIX: str = ""
+    VLLM_LOGGING_STREAM: str = "ext://sys.stdout"
     VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
     VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
     VLLM_LOG_STATS_INTERVAL: float = 10.
@@ -162,12 +163,18 @@ if TYPE_CHECKING:
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
     VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
+    VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
+    VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
     VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
     VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
+    VLLM_GPT_OSS_USE_CONTAINER_TOOL: bool = False
+    VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
+    VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
+    VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
 
 
 def get_default_cache_root():
@@ -235,7 +242,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # ================== Installation Time Env Vars ==================
 
     # Target device of vLLM, supporting [cuda (by default),
-    # rocm, neuron, cpu]
+    # rocm, cpu]
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
 
@@ -431,6 +438,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_LOGGING_LEVEL":
     lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO").upper(),
 
+    # this is used for configuring the default logging stream
+    "VLLM_LOGGING_STREAM":
+    lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"),
+
     # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages
     "VLLM_LOGGING_PREFIX":
     lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
@@ -463,6 +474,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # - "ROCM_FLASH": use ROCmFlashAttention
     # - "FLASHINFER": use flashinfer
     # - "FLASHMLA": use FlashMLA
+    # - "FLASH_ATTN_MLA": use FlashAttention for MLA
     "VLLM_ATTENTION_BACKEND":
     lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
 
@@ -994,6 +1006,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
     lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
 
+    # If set to 1, use the FlashInfer CUTLASS backend for
+    # MXFP8 (activation) x MXFP4 (weight) MoE.
+    # This is separate from the TRTLLMGEN path controlled by
+    # VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8.
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS":
+    lambda: bool(int(
+        os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "0")
+        )),
+
     # If set to 1, use the FlashInfer
     # BF16 (activation) x MXFP4 (weight) MoE backend.
     "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
@@ -1063,7 +1084,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # vllm should use flashinfer fused allreduce. The variable should be a
     # JSON with the following format:
     #     { <world size>: <max size in mb> }
-    # Unspecified world sizes will fallback to
+    # Unspecified world sizes will fall back to
     #     { 2: 64, 4: 1, <everything else>: 0.5 }
     "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
     lambda: json.loads(os.getenv(
@@ -1135,6 +1156,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_USE_TRTLLM_ATTENTION":
     lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
 
+    # If set to 1, when we use fp8 kv, we do not quantize Q to fp8
+    "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION":
+    lambda: bool(int(os.getenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "0"))),
+
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
     "VLLM_HAS_FLASHINFER_CUBIN":
@@ -1199,6 +1224,23 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_TUNED_CONFIG_FOLDER":
     lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
 
+    # Allows vllm use container tool
+    "VLLM_GPT_OSS_USE_CONTAINER_TOOL":
+    lambda: bool(int(os.getenv("VLLM_GPT_OSS_USE_CONTAINER_TOOL", "0"))),
+
+    # Allows harmony instructions to be injected on system messages
+    "VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS":
+    lambda: bool(
+        int(os.getenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "0"))),
+
+    # Add optional custom scopes for profiling, disable to avoid overheads
+    "VLLM_CUSTOM_SCOPES_FOR_PROFILING":
+    lambda: bool(int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))),
+
+    # Represent block hashes in KV cache events as 64-bit integers instead of
+    # raw bytes. Defaults to True for backward compatibility.
+    "VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES":
+    lambda: bool(int(os.getenv("VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES", "1"))),
 }
 
 # --8<-- [end:env-vars-definition]
@@ -1269,9 +1311,11 @@ def compute_hash() -> str:
         "VLLM_USE_FLASHINFER_MOE_FP8",
         "VLLM_USE_FLASHINFER_MOE_FP4",
         "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
+        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS",
         "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
         "VLLM_USE_CUDNN_PREFILL",
         "VLLM_USE_TRTLLM_ATTENTION",
+        "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION",
         "VLLM_ROCM_USE_AITER",
         "VLLM_ROCM_USE_AITER_PAGED_ATTN",
         "VLLM_ROCM_USE_AITER_LINEAR",
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 813232cd19281478a720284403020badb685206c..a3c1d79a58b269f079ef990264623a35e9242939 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -231,7 +231,7 @@ class ExecutorBase(ABC):
 
     def shutdown(self) -> None:
         """Shutdown the executor."""
-        return
+        self.collective_rpc("shutdown")
 
     def __del__(self):
         self.shutdown()
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 37c3fe59c65dd3d8901d668ca92c79ec750adf3f..78d0ee6c1e3fc6f861dd5599a7761db60ca22b32 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -117,10 +117,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
                 self.driver_worker.execute_method)
 
     def shutdown(self) -> None:
-        logger.info(
-            "Shutting down Ray distributed executor. If you see error log "
-            "from logging.cc regarding SIGTERM received, please ignore because "
-            "this is the expected termination process in Ray.")
+        if logger:
+            # Somehow logger can be None here.
+            logger.info(
+                "Shutting down Ray distributed executor. If you see error log "
+                "from logging.cc regarding SIGTERM received, please ignore "
+                "because this is the expected termination process in Ray.")
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 4b2a15afb67a7d4530381cb313e7cc1817601c54..0bdeb28569892dd7d8cbedb0000529a8201c6ec3 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -223,7 +223,7 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
 
     """
     # Wait until PG is ready - this will block until all
-    # requested resources are available, and will timeout
+    # requested resources are available, and will time out
     # if they cannot be provisioned.
     placement_group_specs = current_placement_group.bundle_specs
 
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index aabc9ed9b80a23c2e5db5a56cd655dc6218520cf..f45a94f3151b699e703dacd30db04f5b63e98433 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -71,6 +71,10 @@ class UniProcExecutor(ExecutorBase):
             self.shutdown()
         return
 
+    def shutdown(self) -> None:
+        if worker := self.driver_worker:
+            worker.shutdown()
+
 
 UniProcExecutorAsync = UniProcExecutor
 
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 065d0ab59291af884d6af5d6058c1cbbae35e97f..6a005aa634e854af3f06cef3f984a9c0bff16553 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -52,6 +52,9 @@ class TokensPrompt(TypedDict):
     prompt_token_ids: list[int]
     """A list of token IDs to pass to the model."""
 
+    prompt: NotRequired[str]
+    """The prompt text corresponding to the token IDs, if available."""
+
     token_type_ids: NotRequired[list[int]]
     """A list of token type IDs to pass to the cross encoder model."""
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 094fcf021b61926bddaf046765186b2f5c31b5c9..22287aa6f41e004cbdd21c286f745e5d0b69ede6 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -258,8 +258,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -276,13 +275,23 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(
+        mm_input = mm_processor.apply(
             prompt,
             mm_data,
             hf_processor_mm_kwargs=mm_processor_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
+        mm_hashes = mm_input["mm_hashes"]
+
+        # Validate that all mm items have a string as their hash
+        if not contains_only_strings(mm_hashes):
+            raise ValueError(
+                f"mm_hashes must contain only strings, got: {mm_hashes}. "
+                "This is likely due to an incorrect custom implementation of "
+                "MultiModalProcessor.apply method.")
+
+        return mm_input
 
     async def _process_multimodal_async(
         self,
@@ -292,8 +301,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         """
         Async version of
@@ -310,13 +318,23 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(
+        mm_input = mm_processor.apply(
             prompt,
             mm_data,
             hf_processor_mm_kwargs=mm_processor_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
+        mm_hashes = mm_input["mm_hashes"]
+
+        # Validate that all mm items have a string as their hash
+        if not contains_only_strings(mm_hashes):
+            raise ValueError(
+                f"mm_hashes must contain only strings, got: {mm_hashes}. "
+                "This is likely due to an incorrect custom implementation of "
+                "MultiModalProcessor.apply method.")
+
+        return mm_input
 
     def _process_embeds(
         self,
@@ -370,8 +388,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_token_ids = self._truncate_inputs(
             parsed_content["prompt_token_ids"], tokenization_kwargs)
@@ -384,7 +401,7 @@ class InputPreprocessor:
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         else:
             inputs = token_inputs(prompt_token_ids=prompt_token_ids)
@@ -400,8 +417,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_token_ids = self._truncate_inputs(
             parsed_content["prompt_token_ids"], tokenization_kwargs)
@@ -414,7 +430,7 @@ class InputPreprocessor:
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         else:
             inputs = token_inputs(prompt_token_ids=prompt_token_ids, )
@@ -430,8 +446,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_text = parsed_content["prompt"]
 
@@ -443,7 +458,7 @@ class InputPreprocessor:
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         else:
             prompt_token_ids = self._tokenize_prompt(
@@ -467,8 +482,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> Union[TokenInputs, MultiModalInputs]:
         prompt_text = parsed_content["prompt"]
 
@@ -480,7 +494,7 @@ class InputPreprocessor:
                 parsed_content.get("mm_processor_kwargs"),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         else:
             prompt_token_ids = await self._tokenize_prompt_async(
@@ -504,8 +518,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -527,21 +540,21 @@ class InputPreprocessor:
             return self._process_tokens(
                 parsed["content"],
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         if parsed["type"] == "text":
             return self._process_text(
                 parsed["content"],
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         if parsed["type"] == "str":
             return self._process_text(
                 TextPrompt(prompt=parsed["content"]),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         assert_never(parsed)
@@ -552,8 +565,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> SingletonInputs:
         """
         Async version of
@@ -567,21 +579,21 @@ class InputPreprocessor:
             return await self._process_tokens_async(
                 parsed["content"],
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         if parsed["type"] == "text":
             return await self._process_text_async(
                 parsed["content"],
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
         if parsed["type"] == "str":
             return await self._process_text_async(
                 TextPrompt(prompt=parsed["content"]),
                 tokenization_kwargs=tokenization_kwargs,
                 lora_request=lora_request,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         assert_never(parsed)
@@ -692,8 +704,7 @@ class InputPreprocessor:
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -735,7 +746,7 @@ class InputPreprocessor:
             encoder_inputs = self._prompt_to_llm_inputs(
                 prompt["encoder_prompt"],
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
             if (decoder_input := prompt["decoder_prompt"]) is None:
                 decoder_inputs = None
@@ -751,7 +762,7 @@ class InputPreprocessor:
             inputs = self._prompt_to_llm_inputs(
                 prompt,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
             if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
@@ -768,8 +779,7 @@ class InputPreprocessor:
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> EncoderDecoderInputs:
         """
         Async version of
@@ -782,7 +792,7 @@ class InputPreprocessor:
             encoder_task = self._prompt_to_llm_inputs_async(
                 prompt["encoder_prompt"],
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
             if (decoder_input := prompt["decoder_prompt"]) is None:
@@ -792,7 +802,7 @@ class InputPreprocessor:
                 decoder_task = self._prompt_to_llm_inputs_async(
                     decoder_input,
                     tokenization_kwargs=tokenization_kwargs,
-                    mm_hash_overrides=mm_hash_overrides,
+                    mm_uuids=mm_uuids,
                 )
 
                 encoder_inputs, decoder_inputs = await asyncio.gather(
@@ -808,7 +818,7 @@ class InputPreprocessor:
             inputs = await self._prompt_to_llm_inputs_async(
                 prompt,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
             if self.model_config.is_multimodal_model:
                 # Encoder-Decoder Multimodal model
@@ -836,8 +846,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -858,7 +867,7 @@ class InputPreprocessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
@@ -869,8 +878,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> DecoderOnlyInputs:
         """
         Async version of
@@ -880,7 +888,7 @@ class InputPreprocessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
@@ -891,8 +899,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
@@ -901,7 +908,7 @@ class InputPreprocessor:
             return self._process_encoder_decoder_prompt(
                 prompt,
                 tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         if is_explicit_encoder_decoder_prompt(prompt):
@@ -913,7 +920,7 @@ class InputPreprocessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
     async def preprocess_async(
@@ -922,8 +929,7 @@ class InputPreprocessor:
         tokenization_kwargs: Optional[dict[str, Any]] = None,
         lora_request: Optional[LoRARequest] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> ProcessorInputs:
         """
         Async version of
@@ -935,7 +941,7 @@ class InputPreprocessor:
             return await self._process_encoder_decoder_prompt_async(
                 prompt,
                 tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         if is_explicit_encoder_decoder_prompt(prompt):
@@ -947,9 +953,21 @@ class InputPreprocessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
     def clear_cache(self) -> None:
         if self.mm_processor_cache is not None:
             self.mm_processor_cache.clear_cache()
+
+
+# Helper function to validate that a nested dictionary contains
+# only strings or list of strings as the leaf values.
+def contains_only_strings(obj: object):
+    if isinstance(obj, str):
+        return True
+    if isinstance(obj, list):
+        return all(isinstance(x, str) for x in obj)
+    if isinstance(obj, dict):
+        return all(contains_only_strings(v) for v in obj.values())
+    return False
diff --git a/vllm/logger.py b/vllm/logger.py
index 8f06eb03c7f93bf5ad4e6e2cfa2e43ea19e2e5f8..2861e0f1686c4ecbe830c2b5152e31fa550207e2 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -20,9 +20,10 @@ VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
 VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
 VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
 VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
+VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM
 
 _FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
-           "[%(filename)s:%(lineno)d] %(message)s")
+           "[%(fileinfo)s:%(lineno)d] %(message)s")
 _DATE_FORMAT = "%m-%d %H:%M:%S"
 
 DEFAULT_LOGGING_CONFIG = {
@@ -38,7 +39,7 @@ DEFAULT_LOGGING_CONFIG = {
             "class": "logging.StreamHandler",
             "formatter": "vllm",
             "level": VLLM_LOGGING_LEVEL,
-            "stream": "ext://sys.stdout",
+            "stream": VLLM_LOGGING_STREAM,
         },
     },
     "loggers": {
diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py
index 0affef10078dcb6c9ef74926ca9a72f5cfe8ba60..004b79f3ea6e274431daa619b4a25471751e83bb 100644
--- a/vllm/logging_utils/formatter.py
+++ b/vllm/logging_utils/formatter.py
@@ -2,16 +2,77 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
+from pathlib import Path
+
+from vllm import envs
 
 
 class NewLineFormatter(logging.Formatter):
     """Adds logging prefix to newlines to align multi-line messages."""
 
     def __init__(self, fmt, datefmt=None, style="%"):
-        logging.Formatter.__init__(self, fmt, datefmt, style)
+        super().__init__(fmt, datefmt, style)
+
+        self.use_relpath = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+        if self.use_relpath:
+            self.root_dir = Path(__file__).resolve().parent.parent.parent
 
     def format(self, record):
-        msg = logging.Formatter.format(self, record)
+
+        def shrink_path(relpath: Path) -> str:
+            """
+            Shortens a file path for logging display:
+            - Removes leading 'vllm' folder if present.
+            - If path starts with 'v1',
+            keeps the first two and last two levels,
+            collapsing the middle as '...'.
+            - Otherwise, keeps the first and last two levels,
+            collapsing the middle as '...'.
+            - If the path is short, returns it as-is.
+            - Examples:
+            vllm/model_executor/layers/quantization/utils/fp8_utils.py ->
+            model_executor/.../quantization/utils/fp8_utils.py
+            vllm/model_executor/layers/quantization/awq.py ->
+            model_executor/layers/quantization/awq.py
+            vllm/v1/attention/backends/mla/common.py ->
+            v1/attention/backends/mla/common.py
+
+            Args:
+                relpath (Path): The relative path to be shortened.
+            Returns:
+                str: The shortened path string for display.
+            """
+            parts = list(relpath.parts)
+            new_parts = []
+            if parts and parts[0] == "vllm":
+                parts = parts[1:]
+            if parts and parts[0] == "v1":
+                new_parts += parts[:2]
+                parts = parts[2:]
+            elif parts:
+                new_parts += parts[:1]
+                parts = parts[1:]
+            if len(parts) > 2:
+                new_parts += ["..."] + parts[-2:]
+            else:
+                new_parts += parts
+            return "/".join(new_parts)
+
+        if self.use_relpath:
+            abs_path = getattr(record, "pathname", None)
+            if abs_path:
+                try:
+                    relpath = Path(abs_path).resolve().relative_to(
+                        self.root_dir)
+                except Exception:
+                    relpath = Path(record.filename)
+            else:
+                relpath = Path(record.filename)
+            record.fileinfo = shrink_path(relpath)
+        else:
+            record.fileinfo = record.filename
+
+        msg = super().format(record)
         if record.message != "":
             parts = msg.split(record.message)
             msg = msg.replace("\n", "\r\n" + parts[0])
diff --git a/vllm/logprobs.py b/vllm/logprobs.py
new file mode 100644
index 0000000000000000000000000000000000000000..e58ca142c00a4dd07f479c3f2f9e34786a0df36d
--- /dev/null
+++ b/vllm/logprobs.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import Optional
+
+
+# We use dataclass for now because it is used for
+# openai server output, and msgspec is not serializable.
+# TODO(sang): Fix it.
+@dataclass
+class Logprob:
+    """Infos for supporting OpenAI compatible logprobs and token ranks.
+
+    Attributes:
+        logprob: The logprob of chosen token
+        rank: The vocab rank of chosen token (>=1)
+        decoded_token: The decoded chosen token index
+    """
+    logprob: float
+    rank: Optional[int] = None
+    decoded_token: Optional[str] = None
+
+
+# {token_id -> logprob} per each sequence group. None if the corresponding
+# sequence group doesn't require prompt logprob.
+PromptLogprobs = list[Optional[dict[int, Logprob]]]
+# {token_id -> logprob} for each sequence group.
+SampleLogprobs = list[dict[int, Logprob]]
diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py
deleted file mode 100644
index 7fc4cfe026aee112a37248568ff058c8d3ac1193..0000000000000000000000000000000000000000
--- a/vllm/lora/fully_sharded_layers.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# pylint: disable=unused-argument
-from typing import TYPE_CHECKING, Optional, Union, cast
-
-import torch
-import torch.nn as nn
-from transformers import PretrainedConfig
-
-from vllm.config import LoRAConfig
-from vllm.distributed.communication_op import (
-    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
-from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
-from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithLoRA,
-                              MergedQKVParallelLinearWithLoRA,
-                              QKVParallelLinearWithLoRA,
-                              RowParallelLinearWithLoRA)
-from vllm.platforms import current_platform
-
-if TYPE_CHECKING:
-    pass
-
-
-def _fully_sharded_can_replace(can_replace):
-    """
-    decorator which adds the condition of fully sharded loras
-    intended to wrap can_replace_layer()
-    """
-
-    def dec(*args, **kwargs):
-        return (can_replace(*args, **kwargs)
-                and kwargs["lora_config"].fully_sharded_loras)
-
-    return dec
-
-
-def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA):
-    """ 
-    For `ColumnParallelLinearWithLoRA` or classes that inherit from 
-    `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
-    """
-    assert (layer.n_slices == len(layer.lora_a_stacked) == len(
-        layer.lora_b_stacked) == len(layer.output_slices))
-    if layer.lora_bias_stacked is not None:
-        assert layer.n_slices == len(layer.lora_bias_stacked)
-
-    output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
-
-    x = x.view(-1, x.shape[-1])
-    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
-
-    # Since communication is needed, the buffer is directly initialized as a
-    # tensor rather than a tuple of tensor.
-    buffers = torch.zeros(
-        (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]),
-        dtype=torch.float32,
-        device=x.device,
-    )
-
-    shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink(
-        buffers, x, layer.lora_a_stacked, 1.0)
-
-    if not current_platform.can_update_inplace():
-        buffers = shrunk_buffers
-
-    buffers = tensor_model_parallel_all_gather(buffers)
-
-    lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand(
-        output,
-        buffers,
-        layer.lora_b_stacked,
-        layer.lora_bias_stacked,
-        layer.output_slices,
-        offset_start=0,
-        add_input=True)
-
-    if not current_platform.can_update_inplace():
-        output = lora_output
-
-    output = output.view(*out_orig_shape)
-    # now have column partitioned and packed output
-    return output
-
-
-# these layers are based on the tensor parallelism strategy given in
-# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
-# https://arxiv.org/abs/2311.03285.
-
-
-class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
-    """
-    Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also.
-
-    Based on S-LoRA, slicing happens along the rank dim.
-    """
-
-    # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`,
-    # their `lora_a` and `lora_b` have different sharding patterns. After
-    # completing the `lora_a` GEMM , a gather operation is performed.
-    # Therefore, the sharding of `lora_a` only needs to correspond with the
-    # gather operation.
-    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-        tp_rank = get_tensor_model_parallel_rank()
-        shard_size = self.lora_a_stacked[0].shape[2]
-        start_idx = tp_rank * shard_size
-        lora_a = lora_a[:, start_idx:start_idx + shard_size]
-        return lora_a
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _mcp_apply(x, bias, self)
-
-    @classmethod
-    @_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        # specifying kwargs so they can be easily accessed in decorator
-        return super().can_replace_layer(
-            source_layer=source_layer,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_list,
-            model_config=model_config,
-            decorate=False,
-        )
-
-
-class MergedColumnParallelLinearWithShardedLoRA(
-        MergedColumnParallelLinearWithLoRA):
-    """
-    Differs from MergedColumnParallelLinearWithLoRA by slicing the
-    LoRA A's also.
-
-    Based on S-LoRA, slicing happens along the rank dim.
-    """
-
-    def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        #NOTE: lora_a contains 2 subloras, and each sublora could be None.
-        output_shard_size = self.lora_a_stacked[0].shape[2]
-        output_start_idx = self.tp_rank * output_shard_size
-        lora_a = [
-            lora_a[0][:, output_start_idx:output_start_idx +
-                      output_shard_size] if lora_a[0] is not None else None,
-            lora_a[1][:, output_start_idx:output_start_idx +
-                      output_shard_size] if lora_a[1] is not None else None,
-        ]
-        return lora_a
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _mcp_apply(x, bias, self)
-
-    @classmethod
-    @_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        # specifying kwargs so they can be easily accessed in decorator
-        return super().can_replace_layer(
-            source_layer=source_layer,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_list,
-            model_config=model_config,
-            decorate=False,
-        )
-
-
-class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
-    """
-    Differs from QKVParallelLinearWithLoRA by slicing the
-    LoRA A's also.
-
-    Based on S-LoRA, slicing happens along the rank dim.
-    """
-
-    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-        tp_rank = get_tensor_model_parallel_rank()
-        shard_size = self.lora_a_stacked[0].shape[2]
-        start_idx = tp_rank * shard_size
-        lora_a = lora_a[:, start_idx:start_idx + shard_size]
-        return lora_a
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _mcp_apply(x, bias, self)
-
-    @classmethod
-    @_fully_sharded_can_replace
-    def can_replace_layer(cls, source_layer: nn.Module,
-                          lora_config: LoRAConfig, packed_modules_list: list,
-                          model_config: Optional[PretrainedConfig]) -> bool:
-        # specifying kwargs so they can be easily accessed in decorator
-        return super().can_replace_layer(
-            source_layer=source_layer,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_list,
-            model_config=model_config,
-            decorate=False,
-        )
-
-
-class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
-    """
-    Differs from MergedQKVParallelLinearWithLoRA by slicing the 
-    LoRA A's also.
-
-    Based on S-LoRA, slicing happens along the rank dim.
-    """
-
-    def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        # NOTE: lora_a contains 3 subloras, and each sublora could be None.
-        shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)]
-        start_idx = [self.tp_rank * shard_size[i] for i in range(3)]
-        lora_a = [
-            lora_a[0][:, start_idx[0]:start_idx[0] +
-                      shard_size[0]] if lora_a[0] is not None else None,
-            lora_a[1][:, start_idx[1]:start_idx[1] +
-                      shard_size[1]] if lora_a[1] is not None else None,
-            lora_a[2][:, start_idx[2]:start_idx[2] +
-                      shard_size[2]] if lora_a[2] is not None else None,
-        ]
-        return lora_a
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _mcp_apply(x, bias, self)
-
-    @classmethod
-    @_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        # specifying kwargs so they can be easily accessed in decorator
-        return super().can_replace_layer(
-            source_layer=source_layer,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_list,
-            model_config=model_config,
-            decorate=False,
-        )
-
-
-class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
-    """
-    Differs from RowParallelLinearWithLoRA by slicing the
-    LoRA B's also.
-
-    Based on S-LoRA, slicing happens along the output dim.
-    This yields a combined partial sum from the row parallel base
-    layer and column partitioned output from the LoRA.
-    """
-
-    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-        shard_size = self.lora_b_stacked[0].shape[2]
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        lora_b = lora_b[:, start_idx:end_idx]
-        return lora_b
-
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        if bias is None:
-            return bias
-        self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                      self.lora_bias_stacked)
-        shard_size = self.lora_bias_stacked[0].shape[2]
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        bias = bias[start_idx:end_idx]
-        return bias
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x)
-
-        x = x.view(-1, x.shape[-1])
-        output, out_orig_shape = output.view(-1,
-                                             output.shape[-1]), output.shape
-        buffer = torch.zeros(
-            (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
-            dtype=torch.float32,
-            device=x.device,
-        )
-
-        shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
-            buffer, x, self.lora_a_stacked, 1.0)
-        if not current_platform.can_update_inplace():
-            buffer = shrunk_buffer
-
-        buffer = tensor_model_parallel_all_reduce(buffer)
-
-        # following S-LoRA, allows the fusing of all_gather and all_reduce
-        # by adding the column partitioned lora output to a slice of output
-        # tensor, which is a partial sum due to row parallel. All that
-        # remains is a standard all_reduce. User should be aware though that
-        # the output is not the same as a normal row_parallel, it should be
-        # reduced before being used
-        # NOTE offset are based on the rank.
-        shard_size = self.lora_b_stacked[0].shape[2]
-        offset_start = self.tp_rank * shard_size
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand(
-            output,
-            buffer,
-            self.lora_b_stacked,
-            self.lora_bias_stacked,
-            self.output_slices,
-            offset_start=offset_start,
-            add_input=True,
-        )
-
-        if not current_platform.can_update_inplace():
-            output = lora_output
-
-        output = output.view(*out_orig_shape)
-        return output
-
-    @classmethod
-    @_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        # specifying kwargs so they can be easily accessed in decorator
-        return super().can_replace_layer(
-            source_layer=source_layer,
-            lora_config=lora_config,
-            packed_modules_list=packed_modules_list,
-            model_config=model_config,
-            decorate=False,
-        )
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
deleted file mode 100644
index d8503b20459f6d18e8e6dc9707a3253595760f99..0000000000000000000000000000000000000000
--- a/vllm/lora/layers.py
+++ /dev/null
@@ -1,1192 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# pylint: disable=unused-argument
-import math
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Union, cast
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from transformers import PretrainedConfig
-
-from vllm.adapter_commons.layers import AdapterMapping
-from vllm.config import LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              split_tensor_along_last_dim,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
-from vllm.distributed.utils import divide
-# yapf: disable
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               LinearBase,
-                                               MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
-# yapf: enable
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding)
-from vllm.platforms import current_platform
-
-if TYPE_CHECKING:
-    from vllm.lora.punica_wrapper import PunicaWrapperBase
-
-
-def _get_lora_device(base_layer: nn.Module) -> torch.device:
-    # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
-    """Returns the device for where to place the LoRA tensors."""
-    # unquantizedLinear
-    if hasattr(base_layer, "weight"):
-        return base_layer.weight.device
-    # Compressed Tensor
-    elif hasattr(base_layer, "weight_packed"):
-        return base_layer.weight_packed.device
-    # GPTQ/AWQ
-    elif hasattr(base_layer, "qweight"):
-        return base_layer.qweight.device
-    # HQQ marlin
-    elif hasattr(base_layer, "W_q"):
-        return base_layer.W_q.device
-    else:
-        raise ValueError(f"Unsupported base layer: {base_layer}")
-
-
-def _not_fully_sharded_can_replace(can_replace):
-    """
-    decorator which adds the condition of not using fully sharded loras
-    intended to wrap can_replace_layer()
-    """
-
-    def dec(*args, **kwargs):
-        decorate = kwargs.pop("decorate") if "decorate" in kwargs else True
-        condition = (not kwargs["lora_config"].fully_sharded_loras
-                     if decorate else True)
-        return can_replace(*args, **kwargs) and condition
-
-    return dec
-
-
-@dataclass
-class LoRAMapping(AdapterMapping):
-    is_prefill: bool = False
-
-
-class BaseLayerWithLoRA(nn.Module):
-
-    def slice_lora_a(
-        self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
-        """Slice lora a if splitting for tensor parallelism."""
-        ...
-
-    def slice_lora_b(
-        self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
-        """Slice lora b if splitting with tensor parallelism."""
-        ...
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        """Initializes lora matrices."""
-        ...
-
-    def reset_lora(self, index: int):
-        """Resets the lora weights at index back to 0."""
-        ...
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        """Overwrites lora tensors at index."""
-        ...
-
-    def set_mapping(
-        self,
-        punica_wrapper,
-    ):
-        self.punica_wrapper: PunicaWrapperBase = punica_wrapper
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        """Returns True if the layer can be replaced by this LoRA layer."""
-        raise NotImplementedError
-
-
-class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
-
-    def __init__(self, base_layer: VocabParallelEmbedding) -> None:
-        super().__init__()
-        self.base_layer = base_layer
-        self.embeddings_slice: Optional[tuple[int, int]]
-        self.embeddings_weights: Optional[torch.Tensor]
-
-    def create_lora_weights(
-            self,
-            max_loras: int,
-            lora_config: LoRAConfig,
-            model_config: Optional[PretrainedConfig] = None) -> None:
-
-        if self.base_layer.num_added_embeddings_per_partition > 0:
-            # We can start adding lora weights
-            self.embeddings_weights = self.base_layer.weight.data[
-                self.base_layer.num_org_embeddings_per_partition:self.
-                base_layer.num_org_embeddings_per_partition +
-                self.base_layer.num_added_embeddings_per_partition]
-            self.embeddings_slice = (
-                self.base_layer.shard_indices.added_vocab_start_index -
-                self.base_layer.org_vocab_size,
-                self.base_layer.shard_indices.added_vocab_end_index -
-                self.base_layer.org_vocab_size)
-            self.base_layer.weight.data[
-                self.base_layer.num_org_embeddings_per_partition:].fill_(0)
-        else:
-            self.embeddings_slice = None
-            self.embeddings_weights = None
-
-        self.embeddings_tensors = torch.zeros(
-            (
-                max_loras,
-                lora_config.lora_extra_vocab_size,
-                self.base_layer.embedding_dim,
-            ),
-            dtype=self.base_layer.weight.dtype,
-            device=self.base_layer.weight.device,
-        )
-        self.lora_a_stacked = torch.zeros(
-            (
-                max_loras,
-                self.base_layer.org_vocab_size +
-                lora_config.lora_extra_vocab_size,
-                lora_config.max_lora_rank,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.base_layer.weight.device,
-        )
-        self.lora_b_stacked = torch.zeros(
-            (
-                max_loras,
-                1,
-                self.base_layer.embedding_dim,
-                lora_config.max_lora_rank,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.base_layer.weight.device,
-        )
-        self.lora_a_stacked_2d = self.lora_a_stacked.view(
-            self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1],
-            self.lora_a_stacked.shape[2],
-        )
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[index] = 0
-        self.lora_b_stacked[index] = 0
-        self.embeddings_tensors[index] = 0
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-        self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_(
-            lora_a, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if embeddings_tensor is not None:
-            self.embeddings_tensors[
-                index,
-                :embeddings_tensor.shape[0],
-                :embeddings_tensor.shape[1],
-            ].copy_(embeddings_tensor, non_blocking=True)
-            if self.embeddings_slice is not None:
-                # TODO(yard1): Optimize this copy, we don't need to copy
-                # everything, just the modified part
-                embeddings = self.embeddings_tensors.view(
-                    self.embeddings_tensors.shape[0] *
-                    self.embeddings_tensors.shape[1],
-                    self.embeddings_tensors.shape[2],
-                )[self.embeddings_slice[0]:self.embeddings_slice[1]]
-                assert self.embeddings_weights is not None
-                self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1,
-                                        1, 0)
-
-        # NB: Don't use torch.narrow here. torch.narrow triggers some
-        # Dynamic Shape specialization in torch.compile
-        num_tokens = x.shape[0]
-        indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens]
-        indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens]
-
-        full_lora_a_embeddings = F.embedding(
-            x + indices_1,
-            self.lora_a_stacked_2d,
-        )
-        full_output = self.base_layer.forward(x +
-                                              (indices_0 * added_tokens_mask))
-
-        full_output_org = full_output
-        if full_output.ndim == 3:
-            full_output = full_output.view(
-                full_output.shape[0] * full_output.shape[1], -1)
-        if full_lora_a_embeddings.ndim == 3:
-            full_lora_a_embeddings = full_lora_a_embeddings.view(
-                full_lora_a_embeddings.shape[0] *
-                full_lora_a_embeddings.shape[1],
-                -1,
-            )
-
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_embedding(
-                full_output,
-                full_lora_a_embeddings,
-                self.lora_b_stacked,
-                add_input=True)
-
-        if not current_platform.can_update_inplace():
-            full_output = lora_output
-
-        return full_output.view_as(full_output_org)
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is VocabParallelEmbedding
-
-    @property
-    def weight(self):
-        return self.base_layer.weight
-
-
-class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
-
-    def __init__(self, base_layer: LinearBase):
-        super().__init__()
-        self.base_layer = base_layer
-        self.input_size = self.base_layer.input_size
-        self.device = _get_lora_device(self.base_layer)
-        self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None
-
-        self.output_slices: tuple[int, ...]
-        self.tp_size: int
-        self.output_size: int
-        self.n_slices: int
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        self.lora_config = lora_config
-        #
-        if isinstance(self.base_layer, ReplicatedLinear):
-            lora_a_out_size = lora_config.max_lora_rank
-            lora_b_out_size = self.output_size
-
-        elif isinstance(self.base_layer, ColumnParallelLinear):
-            lora_a_out_size = (lora_config.max_lora_rank if
-                               not lora_config.fully_sharded_loras else divide(
-                                   lora_config.max_lora_rank, self.tp_size))
-            lora_b_out_size = self.output_size
-
-        elif isinstance(self.base_layer, RowParallelLinear):
-            lora_a_out_size = lora_config.max_lora_rank
-            lora_b_out_size = (self.output_size if
-                               not lora_config.fully_sharded_loras else divide(
-                                   self.output_size, self.tp_size))
-        else:
-            raise NotImplementedError
-
-        self.lora_a_stacked = tuple(
-            torch.zeros(
-                max_loras,
-                1,
-                lora_a_out_size,
-                self.input_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ) for _ in range(self.n_slices))
-        self.lora_b_stacked = tuple(
-            torch.zeros(
-                max_loras,
-                1,
-                lora_b_out_size,
-                lora_config.max_lora_rank,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ) for _ in range(self.n_slices))
-        if lora_config.bias_enabled:
-            lora_bias_out_size = lora_b_out_size
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    lora_bias_out_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ) for _ in range(self.n_slices))
-        self.output_slices = (self.lora_b_stacked[0].shape[2], )
-
-    def reset_lora(self, index: int):
-        for s_index in range(self.n_slices):
-            self.lora_a_stacked[s_index][index] = 0
-            self.lora_b_stacked[s_index][index] = 0
-            if self.lora_config.bias_enabled:
-                # Make mypy happy
-                self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                              self.lora_bias_stacked)
-                self.lora_bias_stacked[s_index][index] = 0
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
-    ):
-        # Except for QKVParallelLinearWithLoRA and
-        # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
-        # store weights in a tuple of size 1. These two layers will
-        # override this function.
-        assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
-                self.n_slices == 1)
-
-        self.reset_lora(index)
-        if self.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)
-
-        self.lora_a_stacked[0][index,
-                               0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                   lora_a.T, non_blocking=True)
-        self.lora_b_stacked[0][index,
-                               0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                   lora_b.T, non_blocking=True)
-        if lora_bias is not None:
-
-            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                          self.lora_bias_stacked)
-            assert len(self.lora_bias_stacked)
-            self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
-                lora_bias.T, non_blocking=True)
-
-    def apply(self,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
-
-        # In transformers backend, x and output have extra batch dimension like
-        # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
-        # therefore we need to flatten the batch dimensions.
-        if x.ndim == 3 and output.ndim == 3:
-            output = output.flatten(0, 1)
-            x = x.flatten(0, 1)
-
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_linear(
-                output, x, self.lora_a_stacked, self.lora_b_stacked,
-                self.lora_bias_stacked, 1.0, self.output_slices)
-        if not current_platform.can_update_inplace():
-            output = lora_output
-
-        return output
-
-    @property
-    def weight(self) -> torch.Tensor:
-
-        # unquantizedLinear
-        if hasattr(self.base_layer, "weight"):
-            return self.base_layer.weight
-        # Compressed Tensor
-        elif hasattr(self.base_layer, "weight_packed"):
-            return self.base_layer.weight_packed
-        # GPTQ/AWQ
-        elif hasattr(self.base_layer, "qweight"):
-            return self.base_layer.qweight
-        # marlin
-        elif hasattr(self.base_layer, "B"):
-            return self.base_layer.B
-        # HQQ marlin
-        elif hasattr(self.base_layer, "W_q"):
-            return self.base_layer.W_q
-        else:
-            raise ValueError(f"Unsupported base layer: {self.base_layer}")
-
-    @property
-    def bias(self) -> Optional[torch.Tensor]:
-        if hasattr(self.base_layer, "bias"):
-            return self.base_layer.bias
-        else:
-            return None
-
-
-class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
-
-    def __init__(self, base_layer: ReplicatedLinear) -> None:
-        super().__init__(base_layer, )
-        # To ensure interface compatibility, set to 1 always.
-        self.tp_size = 1
-        self.output_size = self.base_layer.output_size
-        self.n_slices = 1
-
-    def forward(
-        self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
-        """Forward of ReplicatedLinearWithLoRA
-
-        Args:
-            input_: Tensor whose last dimension is `input_size`.
-
-        Returns:
-            - output
-            - bias
-        """
-        bias = (self.base_layer.bias
-                if not self.base_layer.skip_bias_add else None)
-
-        # Matrix multiply.
-        output = self.apply(input_, bias)
-
-        output_bias = (self.base_layer.bias
-                       if self.base_layer.skip_bias_add else None)
-
-        if not self.base_layer.return_bias:
-            return output
-
-        return output, output_bias
-
-    # ReplicatedLinear should always be replaced, regardless of the fully
-    # sharded LoRAs setting, because it is, by definition, copied per GPU.
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is ReplicatedLinear
-
-
-class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
-    """
-    LoRA on top of ColumnParallelLinear layer.
-    LoRA B is sliced for tensor parallelism.
-    There are two types for the `base_layer`:
-    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
-    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
-    """
-
-    def __init__(self, base_layer: ColumnParallelLinear) -> None:
-        super().__init__(base_layer)
-        # The base_layer type is ColumnParallelLinear or
-        # MergedColumnParallelLinear, their weight sharding logic is
-        # inconsistent when TP is greater than 1.
-        self.is_merged_col_linear = type(
-            base_layer) is MergedColumnParallelLinear
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.output_size = self.base_layer.output_size_per_partition
-        # There is only one LoRA layer
-        self.n_slices = 1
-
-    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-        return lora_a
-
-    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-        # Applicable to cases where the base_layer is
-        # MergedColumnParallelLinear.
-        if self.is_merged_col_linear:
-            tp_rank = get_tensor_model_parallel_rank()
-            shard_size = self.output_size // 2
-            offset = lora_b.shape[-1] // 2
-
-            left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) *
-                                 shard_size]
-            right_weight = lora_b[:, offset + tp_rank * shard_size:offset +
-                                  (tp_rank + 1) * shard_size]
-            lora_b = torch.cat([left_weight, right_weight], dim=1)
-        # Applicable to cases where the base_layer is
-        # ColumnParallelLinear.
-        else:
-            tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-            shard_size = self.output_size
-            start_idx = tensor_model_parallel_rank * shard_size
-            end_idx = (tensor_model_parallel_rank + 1) * shard_size
-            lora_b = lora_b[:, start_idx:end_idx]
-        return lora_b
-
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        # TODO: Fix the slicing logic of bias.
-        if bias is None:
-            return bias
-        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
-        shard_size = self.output_size
-        start_idx = tensor_model_parallel_rank * shard_size
-        end_idx = (tensor_model_parallel_rank + 1) * shard_size
-        bias = bias[start_idx:end_idx]
-        return bias
-
-    def forward(
-        self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
-        """Forward of ColumnParallelLinear
-
-        Args:
-            input_: Tensor whose last dimension is `input_size`.
-
-        Returns:
-            - output
-            - bias
-        """
-        bias = (self.base_layer.bias
-                if not self.base_layer.skip_bias_add else None)
-
-        # Matrix multiply.
-        output_parallel = self.apply(input_, bias)
-        if self.base_layer.gather_output:
-            # All-gather across the partitions.
-            output = tensor_model_parallel_all_gather(output_parallel)
-        else:
-            output = output_parallel
-
-        if not self.base_layer.return_bias:
-            return output
-
-        output_bias = (self.base_layer.bias
-                       if self.base_layer.skip_bias_add else None)
-        return output, output_bias
-
-    @classmethod
-    @_not_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is ColumnParallelLinear or (
-            type(source_layer) is MergedColumnParallelLinear
-            and len(packed_modules_list) == 1)
-
-
-class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
-    """ColumnParallelLinear layer that is composed of 2 sublayers (slices)
-    packed together (e.g. gate_proj + up_proj -> gate_up_proj).
-
-    This means we have 2 LoRAs, each applied to one half of the layer.
-
-    Both slices must have the same size.
-    """
-
-    def __init__(
-        self, base_layer: Union[MergedColumnParallelLinear,
-                                QKVParallelLinear]) -> None:
-        super().__init__(base_layer)
-        # There are two LoRA layers
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
-        # the output_sizes in MergedColumnParallelLinear is not sharded by tp
-        # we need to divide it by the tp_size to get correct slices size
-        output_sizes = self.base_layer.output_sizes
-        self.output_slices = tuple(
-            divide(output_size, self.tp_size) for output_size in output_sizes)
-        self.n_slices = len(self.output_slices)
-        self.output_ids = (self.tp_rank, ) * self.n_slices
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        """
-        The main reason for overriding this function is to enhance  code 
-        maintainability.
-        """
-        self.lora_config = lora_config
-
-        lora_a_output_size_per_partition = (
-            lora_config.max_lora_rank if not lora_config.fully_sharded_loras
-            else divide(lora_config.max_lora_rank, self.tp_size))
-
-        self.lora_a_stacked = tuple(
-            torch.zeros(
-                max_loras,
-                1,
-                lora_a_output_size_per_partition,
-                self.input_size,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ) for _ in range(self.n_slices))
-        self.lora_b_stacked = tuple(
-            torch.zeros(
-                max_loras,
-                1,
-                output_size,
-                lora_config.max_lora_rank,
-                dtype=lora_config.lora_dtype,
-                device=self.device,
-            ) for output_size in self.output_slices)
-        if lora_config.bias_enabled:
-            self.lora_bias_stacked = tuple(
-                torch.zeros(
-                    max_loras,
-                    1,
-                    output_size,
-                    dtype=lora_config.lora_dtype,
-                    device=self.device,
-                ) for output_size in self.output_slices)
-
-    def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        return lora_a
-
-    def slice_lora_b(
-        self, lora_b: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
-        sliced_lora_b = [None] * self.n_slices
-        for i, (shard_id, shard_size) in enumerate(
-                zip(self.output_ids, self.output_slices)):
-            if (lora_b_i := lora_b[i]) is not None:
-                sliced_lora_b[i] = lora_b_i[:,
-                                            shard_size * shard_id:shard_size *
-                                            (shard_id + 1)]
-        return sliced_lora_b
-
-    def slice_bias(
-        self, bias: list[Union[torch.Tensor,
-                               None]]) -> list[Union[torch.Tensor, None]]:
-        for i, (shard_id, shard_size) in enumerate(
-                zip(self.output_ids, self.output_slices)):
-            if (bias_i := bias[i]) is not None:
-                bias[i] = bias_i[shard_size * shard_id:shard_size *
-                                 (shard_id + 1)]
-        return bias
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        lora_bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-
-        if self.tp_size > 1:
-            lora_a = self.slice_lora_a(lora_a)
-            lora_b = self.slice_lora_b(lora_b)
-            if lora_bias is not None:
-                lora_bias = self.slice_bias(lora_bias)
-
-        for i in range(self.n_slices):
-            if (lora_a_i := lora_a[i]) is not None:
-                self.lora_a_stacked[i][
-                    index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_(
-                        lora_a_i.T, non_blocking=True)
-            if (lora_b_i := lora_b[i]) is not None:
-                self.lora_b_stacked[i][
-                    index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_(
-                        lora_b_i.T, non_blocking=True)
-
-        if lora_bias is not None:
-            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
-                                          self.lora_bias_stacked)
-            for i in range(self.n_slices):
-                if (lora_bias_i := lora_bias[i]) is not None:
-                    self.lora_bias_stacked[i][index,
-                                              0, :lora_bias_i.shape[0]].copy_(
-                                                  lora_bias_i.T,
-                                                  non_blocking=True)
-
-    @classmethod
-    @_not_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return (type(source_layer) is MergedColumnParallelLinear
-                and len(packed_modules_list) == 2)
-
-
-class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
-    """
-    ColumnParallelLinear layer that is specifically designed for
-    qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
-    only contains a single LoRA within their qkv_proj layer.
-
-    During inference with Tensor Parallel, the weights of lora_b
-    must be accurately partitioned according to the respective ranks.
-
-    Q slice may have different shape than K and V slices (which both have
-    the same shape).
-    """
-
-    def __init__(self, base_layer: QKVParallelLinear) -> None:
-        super().__init__(base_layer)
-        self.q_proj_total_size = (self.base_layer.total_num_heads *
-                                  self.base_layer.head_size)
-        self.q_proj_shard_size = (self.base_layer.num_heads *
-                                  self.base_layer.head_size)
-        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
-                                   self.base_layer.head_size)
-        self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
-                                   self.base_layer.head_size)
-        # There is only one LoRA layer
-        self.n_slices = 1
-
-    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-        tp_rank = get_tensor_model_parallel_rank()
-        self.q_shard_id = tp_rank
-        self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
-        lora_b_q = lora_b[:, self.q_proj_shard_size *
-                          self.q_shard_id:self.q_proj_shard_size *
-                          (self.q_shard_id + 1)]
-        k_offset = self.q_proj_total_size
-        lora_b_k = lora_b[:, k_offset +
-                          self.kv_proj_shard_size * self.kv_shard_id:k_offset +
-                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        v_offset = k_offset + self.kv_proj_total_size
-        lora_b_v = lora_b[:, v_offset +
-                          self.kv_proj_shard_size * self.kv_shard_id:v_offset +
-                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
-        return lora_b
-
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        bias_q = bias[self.q_proj_shard_size *
-                      self.q_shard_id:self.q_proj_shard_size *
-                      (self.q_shard_id + 1)]
-        k_offset = self.q_proj_total_size
-        bias_k = bias[k_offset +
-                      self.kv_proj_shard_size * self.kv_shard_id:k_offset +
-                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        v_offset = k_offset + self.kv_proj_total_size
-        bias_v = bias[v_offset +
-                      self.kv_proj_shard_size * self.kv_shard_id:v_offset +
-                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
-        bias = torch.cat([bias_q, bias_k, bias_v], dim=1)
-        return bias
-
-    @classmethod
-    @_not_fully_sharded_can_replace
-    def can_replace_layer(cls, source_layer: nn.Module,
-                          lora_config: LoRAConfig, packed_modules_list: list,
-                          model_config: Optional[PretrainedConfig]) -> bool:
-        return type(source_layer) is QKVParallelLinear and len(
-            packed_modules_list) == 1
-
-
-class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
-    """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
-    packed together in qkv proj fashion
-    (q_proj + k_proj + v_proj -> qkv_proj).
-
-    This means we have 3 LoRAs, each applied to one slice of the layer.
-
-    Q slice may have different shape than K and V slices (which both have
-    the same shape).
-    """
-
-    def __init__(self, base_layer: QKVParallelLinear) -> None:
-        super().__init__(base_layer)
-        # There are three LoRA layer.
-        self.n_slices = len(self.base_layer.output_sizes)
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
-
-        self.q_proj_shard_size = (self.base_layer.num_heads *
-                                  self.base_layer.head_size)
-        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
-                                   self.base_layer.head_size)
-        self.q_shard_id = self.tp_rank
-        self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
-
-        self.output_slices = (
-            self.q_proj_shard_size,
-            self.kv_proj_shard_size,
-            self.kv_proj_shard_size,
-        )
-        self.output_ids = (
-            self.q_shard_id,
-            self.kv_shard_id,
-            self.kv_shard_id,
-        )
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        """
-        The main reason for overloading this function is to handle inconsistent 
-        weight dimensions in qkv lora.
-        """
-        super().create_lora_weights(max_loras, lora_config, model_config)
-
-    @classmethod
-    @_not_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return (type(source_layer) is QKVParallelLinear
-                and len(packed_modules_list) == 3)
-
-
-#TODO: Implement this
-class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA):
-    pass
-
-
-class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
-
-    def __init__(self, base_layer: RowParallelLinear) -> None:
-        super().__init__(base_layer)
-
-        self.tp_size = get_tensor_model_parallel_world_size()
-        # reset input_size
-        self.input_size = self.base_layer.input_size_per_partition
-        self.output_size = self.base_layer.output_size
-
-        self.tp_rank = get_tensor_model_parallel_rank()
-        # There is only one LoRA layer.
-        self.n_slices = 1
-
-    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
-
-        shard_size = self.input_size
-        start_idx = self.tp_rank * shard_size
-        end_idx = (self.tp_rank + 1) * shard_size
-        lora_a = lora_a[start_idx:end_idx, :]
-        return lora_a
-
-    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
-        return lora_b
-
-    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
-        return bias
-
-    def forward(
-        self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
-        """Forward of RowParallelLinear
-
-        Args:
-            input_: tensor whose last dimension is `input_size`. If
-                    `input_is_parallel` is set, then the last dimension
-                    is `input_size // tp_size`.
-
-        Returns:
-            - output
-            - bias
-        """
-        # set up backprop all-reduce.
-        if self.base_layer.input_is_parallel:
-            input_parallel = input_
-        else:
-            # TODO: simplify code below
-            splitted_input = split_tensor_along_last_dim(
-                input_, num_partitions=self.base_layer.tp_size)
-            input_parallel = splitted_input[self.tp_rank].contiguous()
-
-        # Matrix multiply.
-        output_parallel = self.apply(input_parallel)
-        if self.base_layer.reduce_results and self.base_layer.tp_size > 1:
-            output_ = tensor_model_parallel_all_reduce(output_parallel)
-        else:
-            output_ = output_parallel
-
-        if not self.base_layer.skip_bias_add:
-            output = (output_ + self.base_layer.bias
-                      if self.base_layer.bias is not None else output_)
-            output_bias = None
-        else:
-            output = output_
-            output_bias = self.base_layer.bias
-
-        if not self.base_layer.return_bias:
-            return output
-
-        return output, output_bias
-
-    @classmethod
-    @_not_fully_sharded_can_replace
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        return type(source_layer) is RowParallelLinear
-
-
-class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
-    """
-    LoRA wrapper for LogitsProcessor, with extra logic to handle the
-    application of the LoRA adapter and added LoRA vocabulary.
-
-    Args:
-        base_layer: LogitsProcessor layer
-        hidden_size: hidden size of the model
-        dtype: data type of the model
-        device: device of the model
-        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
-            received from base_layer.get_sharded_to_full_mapping(). If None,
-            no reindexing will be done.
-    """
-
-    def __init__(self, base_layer: LogitsProcessor, hidden_size: int,
-                 dtype: torch.dtype, device: torch.device,
-                 sharded_to_full_mapping: Optional[list[int]]) -> None:
-        super().__init__()
-        self.base_layer = base_layer
-        self.hidden_size = hidden_size
-        self.dtype = dtype
-        self.device = device
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.sharded_to_full_mapping = sharded_to_full_mapping
-
-    @property
-    def logits_as_input(self):
-        return self.base_layer.logits_as_input
-
-    @property
-    def vocab_size(self):
-        return self.base_layer.vocab_size
-
-    @property
-    def scale(self):
-        return self.base_layer.scale
-
-    @property
-    def soft_cap(self):
-        return self.base_layer.soft_cap
-
-    @property
-    def use_all_gather(self):
-        return self.base_layer.use_all_gather
-
-    @property
-    def org_vocab_size(self):
-        return self.base_layer.org_vocab_size
-
-    @property
-    def include_gpu_probs_tensor(self):
-        return self.base_layer.include_gpu_probs_tensor
-
-    @property
-    def should_modify_greedy_probs_inplace(self):
-        return self.base_layer.should_modify_greedy_probs_inplace
-
-    def create_lora_weights(
-        self,
-        max_loras: int,
-        lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
-    ) -> None:
-        # TODO: Verify if this condition can be further relaxed
-        if 32000 < self.base_layer.vocab_size > 257024:
-            raise ValueError("When using LoRA, vocab size must be "
-                             "32000 >= vocab_size <= 257024")
-        self.lora_a_stacked = torch.zeros(
-            (
-                max_loras,
-                1,
-                lora_config.max_lora_rank,
-                self.hidden_size,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        self.lora_b_stacked = torch.zeros(
-            (
-                max_loras,
-                1,
-                # Pad for kernel compatibility
-                math.ceil(self.base_layer.vocab_size /
-                          lora_config.lora_vocab_padding_size) *
-                lora_config.lora_vocab_padding_size,
-                lora_config.max_lora_rank,
-            ),
-            dtype=lora_config.lora_dtype,
-            device=self.device,
-        )
-        self.embeddings_tensors = torch.full(
-            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
-            fill_value=float("-inf"),
-            dtype=self.dtype,
-            device=self.device,
-        )
-        if self.sharded_to_full_mapping is not None:
-            self.sharded_to_full_mapping_gpu = torch.tensor(
-                self.sharded_to_full_mapping,
-                device=self.device,
-                dtype=torch.long)
-        else:
-            self.sharded_to_full_mapping_gpu = None
-
-    def reset_lora(self, index: int):
-        self.lora_a_stacked[index] = 0
-        self.lora_b_stacked[index] = 0
-        self.embeddings_tensors[index] = float("-inf")
-
-    def set_lora(
-        self,
-        index: int,
-        lora_a: torch.Tensor,
-        lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
-    ):
-        self.reset_lora(index)
-        self.lora_a_stacked[index,
-                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
-                                lora_a.T, non_blocking=True)
-        self.lora_b_stacked[index,
-                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
-                                lora_b.T, non_blocking=True)
-        if embeddings_tensor is not None:
-            self.embeddings_tensors[
-                index,
-                :embeddings_tensor.shape[0],
-                :embeddings_tensor.shape[1],
-            ] = embeddings_tensor
-
-    def _get_logits(
-        self,
-        hidden_states: torch.Tensor,
-        lm_head: VocabParallelEmbedding,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[torch.Tensor]:
-        # Get the logits for the next tokens.
-        logits = lm_head.quant_method.apply(lm_head, hidden_states)
-        if embedding_bias is not None:
-            logits += embedding_bias
-
-        # Gather logits for TP
-        logits = self.base_layer._gather_logits(logits)
-
-        if logits is None:
-            return None
-
-        if self.sharded_to_full_mapping_gpu is not None:
-            # Reindex full logits tensor to ensure 1:1 mapping between
-            # index and token_id
-            # Example for:
-            #   org_vocab_size = 4
-            #   added_vocab_size = 2
-            #   pad_to_size = 8
-            #   tp_size = 2
-
-            # indices:  [0, 1, 2,  3, 4, 5, 6,  7]
-            # token_id: [0, 1, 4, -1, 2, 3, 5, -1]
-
-            # Therefore, the mapping is expected to be:
-            # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
-            # we get:
-            # indices:  [0, 1, 2, 3, 4, 5,  6,  7]
-            # token_id: [0, 1, 2, 3, 4, 5, -1, -1]
-            logits = logits[:, self.sharded_to_full_mapping_gpu]
-
-        lora_logits = torch.empty(
-            self.embeddings_tensors.shape[0] + 1,
-            self.embeddings_tensors.shape[1],
-            hidden_states.shape[0],
-            dtype=self.embeddings_tensors.dtype,
-            device=self.embeddings_tensors.device,
-        )
-        torch.matmul(self.embeddings_tensors,
-                     hidden_states.T,
-                     out=lora_logits[:-1])
-
-        neg_inf, pos_inf = current_platform.get_infinity_values(
-            lora_logits.dtype)
-
-        lora_logits[-1] = neg_inf
-        lora_logits = lora_logits.mT
-        indices_padded = self.punica_wrapper.sampler_indices_padded
-
-        if current_platform.is_tpu():
-            indices_padded = indices_padded[:logits.size(0)]
-
-        lora_logits = (lora_logits.reshape(
-            lora_logits.shape[0] * lora_logits.shape[1],
-            lora_logits.shape[2],
-        ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf,
-                                                      posinf=pos_inf,
-                                                      neginf=neg_inf))
-
-        logits[:,
-               self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
-               lora_logits.shape[1]] = lora_logits
-
-        lora_output: Optional[
-            torch.Tensor] = self.punica_wrapper.add_lora_logits(
-                logits, hidden_states, self.lora_a_stacked,
-                self.lora_b_stacked, 1.0)
-
-        if not current_platform.can_update_inplace():
-            logits = lora_output
-
-        # Remove paddings in vocab (if any).
-        logits = logits[:, :self.base_layer.vocab_size]
-        return logits
-
-    def forward(self, *args, **kwargs):
-        return type(self.base_layer).forward(self, *args, **kwargs)
-
-    @classmethod
-    def can_replace_layer(
-        cls,
-        source_layer: nn.Module,
-        lora_config: LoRAConfig,
-        packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
-    ) -> bool:
-        # Special handling for the LogitsProcessor.
-        return False
diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3bb145dc7bf83dd9ca196bc209fccd64b75d96b
--- /dev/null
+++ b/vllm/lora/layers/__init__.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.lora.layers.base import BaseLayerWithLoRA
+from vllm.lora.layers.column_parallel_linear import (
+    ColumnParallelLinearWithLoRA, ColumnParallelLinearWithShardedLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA)
+from vllm.lora.layers.logits_processor import LogitsProcessorWithLoRA
+from vllm.lora.layers.replicated_linear import ReplicatedLinearWithLoRA
+from vllm.lora.layers.row_parallel_linear import (
+    RowParallelLinearWithLoRA, RowParallelLinearWithShardedLoRA)
+from vllm.lora.layers.utils import LoRAMapping
+from vllm.lora.layers.vocal_parallel_embedding import (
+    VocabParallelEmbeddingWithLoRA)
+
+__all__ = [
+    "BaseLayerWithLoRA",
+    "VocabParallelEmbeddingWithLoRA",
+    "LogitsProcessorWithLoRA",
+    "ColumnParallelLinearWithLoRA",
+    "ColumnParallelLinearWithShardedLoRA",
+    "MergedColumnParallelLinearWithLoRA",
+    "MergedColumnParallelLinearWithShardedLoRA",
+    "MergedQKVParallelLinearWithLoRA",
+    "MergedQKVParallelLinearWithShardedLoRA",
+    "QKVParallelLinearWithLoRA",
+    "QKVParallelLinearWithShardedLoRA",
+    "RowParallelLinearWithLoRA",
+    "RowParallelLinearWithShardedLoRA",
+    "ReplicatedLinearWithLoRA",
+    "LoRAMapping",
+]
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a80a033e39b4044121ba2158ff4d6014f2382c36
--- /dev/null
+++ b/vllm/lora/layers/base.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+
+if TYPE_CHECKING:
+    from vllm.lora.punica_wrapper import PunicaWrapperBase
+
+
+class BaseLayerWithLoRA(nn.Module):
+
+    def slice_lora_a(
+        self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
+    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        """Slice lora a if splitting for tensor parallelism."""
+        ...
+
+    def slice_lora_b(
+        self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
+    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        """Slice lora b if splitting with tensor parallelism."""
+        ...
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """Initializes lora matrices."""
+        ...
+
+    def reset_lora(self, index: int):
+        """Resets the lora weights at index back to 0."""
+        ...
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """Overwrites lora tensors at index."""
+        ...
+
+    def set_mapping(
+        self,
+        punica_wrapper,
+    ):
+        self.punica_wrapper: PunicaWrapperBase = punica_wrapper
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        """Returns True if the layer can be replaced by this LoRA layer."""
+        raise NotImplementedError
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..85a1f86ce6bf2bb51784231e559c35e85909e650
--- /dev/null
+++ b/vllm/lora/layers/base_linear.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional, cast
+
+import torch
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.distributed.utils import divide
+# yapf: disable
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               LinearBase, ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.platforms import current_platform
+
+from .base import BaseLayerWithLoRA
+from .utils import _get_lora_device
+
+
+class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
+
+    def __init__(self, base_layer: LinearBase):
+        super().__init__()
+        self.base_layer = base_layer
+        self.input_size = self.base_layer.input_size
+        self.device = _get_lora_device(self.base_layer)
+        self.lora_bias_stacked: Optional[tuple[torch.Tensor, ...]] = None
+
+        self.output_slices: tuple[int, ...]
+        self.tp_size: int
+        self.output_size: int
+        self.n_slices: int
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        self.lora_config = lora_config
+        #
+        if isinstance(self.base_layer, ReplicatedLinear):
+            lora_a_out_size = lora_config.max_lora_rank
+            lora_b_out_size = self.output_size
+
+        elif isinstance(self.base_layer, ColumnParallelLinear):
+            lora_a_out_size = (lora_config.max_lora_rank if
+                               not lora_config.fully_sharded_loras else divide(
+                                   lora_config.max_lora_rank, self.tp_size))
+            lora_b_out_size = self.output_size
+
+        elif isinstance(self.base_layer, RowParallelLinear):
+            lora_a_out_size = lora_config.max_lora_rank
+            lora_b_out_size = (self.output_size if
+                               not lora_config.fully_sharded_loras else divide(
+                                   self.output_size, self.tp_size))
+        else:
+            raise NotImplementedError
+
+        self.lora_a_stacked = tuple(
+            torch.zeros(
+                max_loras,
+                1,
+                lora_a_out_size,
+                self.input_size,
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ) for _ in range(self.n_slices))
+        self.lora_b_stacked = tuple(
+            torch.zeros(
+                max_loras,
+                1,
+                lora_b_out_size,
+                lora_config.max_lora_rank,
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ) for _ in range(self.n_slices))
+        if lora_config.bias_enabled:
+            lora_bias_out_size = lora_b_out_size
+            self.lora_bias_stacked = tuple(
+                torch.zeros(
+                    max_loras,
+                    1,
+                    lora_bias_out_size,
+                    dtype=lora_config.lora_dtype,
+                    device=self.device,
+                ) for _ in range(self.n_slices))
+        self.output_slices = (self.lora_b_stacked[0].shape[2], )
+
+    def reset_lora(self, index: int):
+        for s_index in range(self.n_slices):
+            self.lora_a_stacked[s_index][index] = 0
+            self.lora_b_stacked[s_index][index] = 0
+            if self.lora_config.bias_enabled:
+                # Make mypy happy
+                self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
+                                              self.lora_bias_stacked)
+                self.lora_bias_stacked[s_index][index] = 0
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        lora_bias: Optional[torch.Tensor] = None,
+    ):
+        # Except for QKVParallelLinearWithLoRA and
+        # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
+        # store weights in a tuple of size 1. These two layers will
+        # override this function.
+        assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) ==
+                self.n_slices == 1)
+
+        self.reset_lora(index)
+        if self.tp_size > 1:
+            lora_a = self.slice_lora_a(lora_a)
+            lora_b = self.slice_lora_b(lora_b)
+            if lora_bias is not None:
+                lora_bias = self.slice_bias(lora_bias)
+
+        self.lora_a_stacked[0][index,
+                               0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+                                   lora_a.T, non_blocking=True)
+        self.lora_b_stacked[0][index,
+                               0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                   lora_b.T, non_blocking=True)
+        if lora_bias is not None:
+
+            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
+                                          self.lora_bias_stacked)
+            assert len(self.lora_bias_stacked)
+            self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_(
+                lora_bias.T, non_blocking=True)
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
+
+        # In transformers backend, x and output have extra batch dimension like
+        # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
+        # therefore we need to flatten the batch dimensions.
+        if x.ndim == 3 and output.ndim == 3:
+            output = output.flatten(0, 1)
+            x = x.flatten(0, 1)
+
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_linear(
+                output, x, self.lora_a_stacked, self.lora_b_stacked,
+                self.lora_bias_stacked, 1.0, self.output_slices)
+        if not current_platform.can_update_inplace():
+            output = lora_output
+
+        return output
+
+    @property
+    def weight(self) -> torch.Tensor:
+
+        # unquantizedLinear
+        if hasattr(self.base_layer, "weight"):
+            return self.base_layer.weight
+        # Compressed Tensor
+        elif hasattr(self.base_layer, "weight_packed"):
+            return self.base_layer.weight_packed
+        # GPTQ/AWQ
+        elif hasattr(self.base_layer, "qweight"):
+            return self.base_layer.qweight
+        # marlin
+        elif hasattr(self.base_layer, "B"):
+            return self.base_layer.B
+        # HQQ marlin
+        elif hasattr(self.base_layer, "W_q"):
+            return self.base_layer.W_q
+        else:
+            raise ValueError(f"Unsupported base layer: {self.base_layer}")
+
+    @property
+    def bias(self) -> Optional[torch.Tensor]:
+        if hasattr(self.base_layer, "bias"):
+            return self.base_layer.bias
+        else:
+            return None
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..658fd23165da092a84f82d65b6977af7497ff5fa
--- /dev/null
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -0,0 +1,622 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional, Union, cast
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
+from vllm.distributed.utils import divide
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear)
+from vllm.platforms import current_platform
+
+from .base_linear import BaseLinearLayerWithLoRA
+from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
+
+
+def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
+    """ 
+    For `ColumnParallelLinearWithLoRA` or classes that inherit from 
+    `ColumnParallelLinearWithLoRA`, they share the same `apply` logic.
+    """
+    assert (layer.n_slices == len(layer.lora_a_stacked) == len(
+        layer.lora_b_stacked) == len(layer.output_slices))
+    if layer.lora_bias_stacked is not None:
+        assert layer.n_slices == len(layer.lora_bias_stacked)
+
+    output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias)
+
+    x = x.view(-1, x.shape[-1])
+    output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
+
+    # Since communication is needed, the buffer is directly initialized as a
+    # tensor rather than a tuple of tensor.
+    buffers = torch.zeros(
+        (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]),
+        dtype=torch.float32,
+        device=x.device,
+    )
+
+    shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink(
+        buffers, x, layer.lora_a_stacked, 1.0)
+
+    if not current_platform.can_update_inplace():
+        buffers = shrunk_buffers
+
+    buffers = tensor_model_parallel_all_gather(buffers)
+
+    lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand(
+        output,
+        buffers,
+        layer.lora_b_stacked,
+        layer.lora_bias_stacked,
+        layer.output_slices,
+        offset_start=0,
+        add_input=True)
+
+    if not current_platform.can_update_inplace():
+        output = lora_output
+
+    output = output.view(*out_orig_shape)
+    # now have column partitioned and packed output
+    return output
+
+
+class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
+    """
+    LoRA on top of ColumnParallelLinear layer.
+    LoRA B is sliced for tensor parallelism.
+    There are two types for the `base_layer`:
+    1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`.
+    2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`.
+    """
+
+    def __init__(self, base_layer: ColumnParallelLinear) -> None:
+        super().__init__(base_layer)
+        # The base_layer type is ColumnParallelLinear or
+        # MergedColumnParallelLinear, their weight sharding logic is
+        # inconsistent when TP is greater than 1.
+        self.is_merged_col_linear = type(
+            base_layer) is MergedColumnParallelLinear
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.output_size = self.base_layer.output_size_per_partition
+        # There is only one LoRA layer
+        self.n_slices = 1
+
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        return lora_a
+
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        # Applicable to cases where the base_layer is
+        # MergedColumnParallelLinear.
+        if self.is_merged_col_linear:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = self.output_size // 2
+            offset = lora_b.shape[-1] // 2
+
+            left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) *
+                                 shard_size]
+            right_weight = lora_b[:, offset + tp_rank * shard_size:offset +
+                                  (tp_rank + 1) * shard_size]
+            lora_b = torch.cat([left_weight, right_weight], dim=1)
+        # Applicable to cases where the base_layer is
+        # ColumnParallelLinear.
+        else:
+            tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+            shard_size = self.output_size
+            start_idx = tensor_model_parallel_rank * shard_size
+            end_idx = (tensor_model_parallel_rank + 1) * shard_size
+            lora_b = lora_b[:, start_idx:end_idx]
+        return lora_b
+
+    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
+        # TODO: Fix the slicing logic of bias.
+        if bias is None:
+            return bias
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        shard_size = self.output_size
+        start_idx = tensor_model_parallel_rank * shard_size
+        end_idx = (tensor_model_parallel_rank + 1) * shard_size
+        bias = bias[start_idx:end_idx]
+        return bias
+
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        """Forward of ColumnParallelLinear
+
+        Args:
+            input_: Tensor whose last dimension is `input_size`.
+
+        Returns:
+            - output
+            - bias
+        """
+        bias = (self.base_layer.bias
+                if not self.base_layer.skip_bias_add else None)
+
+        # Matrix multiply.
+        output_parallel = self.apply(input_, bias)
+        if self.base_layer.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+
+        if not self.base_layer.return_bias:
+            return output
+
+        output_bias = (self.base_layer.bias
+                       if self.base_layer.skip_bias_add else None)
+        return output, output_bias
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is ColumnParallelLinear or (
+            type(source_layer) is MergedColumnParallelLinear
+            and len(packed_modules_list) == 1)
+
+
+class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
+    """ColumnParallelLinear layer that is composed of 2 sublayers (slices)
+    packed together (e.g. gate_proj + up_proj -> gate_up_proj).
+
+    This means we have 2 LoRAs, each applied to one half of the layer.
+
+    Both slices must have the same size.
+    """
+
+    def __init__(
+        self, base_layer: Union[MergedColumnParallelLinear,
+                                QKVParallelLinear]) -> None:
+        super().__init__(base_layer)
+        # There are two LoRA layers
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        # the output_sizes in MergedColumnParallelLinear is not sharded by tp
+        # we need to divide it by the tp_size to get correct slices size
+        output_sizes = self.base_layer.output_sizes
+        self.output_slices = tuple(
+            divide(output_size, self.tp_size) for output_size in output_sizes)
+        self.n_slices = len(self.output_slices)
+        self.output_ids = (self.tp_rank, ) * self.n_slices
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """
+        The main reason for overriding this function is to enhance  code 
+        maintainability.
+        """
+        self.lora_config = lora_config
+
+        lora_a_output_size_per_partition = (
+            lora_config.max_lora_rank if not lora_config.fully_sharded_loras
+            else divide(lora_config.max_lora_rank, self.tp_size))
+
+        self.lora_a_stacked = tuple(
+            torch.zeros(
+                max_loras,
+                1,
+                lora_a_output_size_per_partition,
+                self.input_size,
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ) for _ in range(self.n_slices))
+        self.lora_b_stacked = tuple(
+            torch.zeros(
+                max_loras,
+                1,
+                output_size,
+                lora_config.max_lora_rank,
+                dtype=lora_config.lora_dtype,
+                device=self.device,
+            ) for output_size in self.output_slices)
+        if lora_config.bias_enabled:
+            self.lora_bias_stacked = tuple(
+                torch.zeros(
+                    max_loras,
+                    1,
+                    output_size,
+                    dtype=lora_config.lora_dtype,
+                    device=self.device,
+                ) for output_size in self.output_slices)
+
+    def slice_lora_a(
+        self, lora_a: list[Union[torch.Tensor, None]]
+    ) -> list[Union[torch.Tensor, None]]:
+        return lora_a
+
+    def slice_lora_b(
+        self, lora_b: list[Union[torch.Tensor, None]]
+    ) -> list[Union[torch.Tensor, None]]:
+        sliced_lora_b = [None] * self.n_slices
+        for i, (shard_id, shard_size) in enumerate(
+                zip(self.output_ids, self.output_slices)):
+            if (lora_b_i := lora_b[i]) is not None:
+                sliced_lora_b[i] = lora_b_i[:,
+                                            shard_size * shard_id:shard_size *
+                                            (shard_id + 1)]
+        return sliced_lora_b
+
+    def slice_bias(
+        self, bias: list[Union[torch.Tensor,
+                               None]]) -> list[Union[torch.Tensor, None]]:
+        for i, (shard_id, shard_size) in enumerate(
+                zip(self.output_ids, self.output_slices)):
+            if (bias_i := bias[i]) is not None:
+                bias[i] = bias_i[shard_size * shard_id:shard_size *
+                                 (shard_id + 1)]
+        return bias
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        lora_bias: Optional[torch.Tensor] = None,
+    ):
+        self.reset_lora(index)
+
+        if self.tp_size > 1:
+            lora_a = self.slice_lora_a(lora_a)
+            lora_b = self.slice_lora_b(lora_b)
+            if lora_bias is not None:
+                lora_bias = self.slice_bias(lora_bias)
+
+        for i in range(self.n_slices):
+            if (lora_a_i := lora_a[i]) is not None:
+                self.lora_a_stacked[i][
+                    index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_(
+                        lora_a_i.T, non_blocking=True)
+            if (lora_b_i := lora_b[i]) is not None:
+                self.lora_b_stacked[i][
+                    index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_(
+                        lora_b_i.T, non_blocking=True)
+
+        if lora_bias is not None:
+            self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
+                                          self.lora_bias_stacked)
+            for i in range(self.n_slices):
+                if (lora_bias_i := lora_bias[i]) is not None:
+                    self.lora_bias_stacked[i][index,
+                                              0, :lora_bias_i.shape[0]].copy_(
+                                                  lora_bias_i.T,
+                                                  non_blocking=True)
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return (type(source_layer) is MergedColumnParallelLinear
+                and len(packed_modules_list) == 2)
+
+
+class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
+    """
+    ColumnParallelLinear layer that is specifically designed for
+    qkv_proj. Certain models, such as chatglm3 and baichuan-7b,
+    only contains a single LoRA within their qkv_proj layer.
+
+    During inference with Tensor Parallel, the weights of lora_b
+    must be accurately partitioned according to the respective ranks.
+
+    Q slice may have different shape than K and V slices (which both have
+    the same shape).
+    """
+
+    def __init__(self, base_layer: QKVParallelLinear) -> None:
+        super().__init__(base_layer)
+        self.q_proj_total_size = (self.base_layer.total_num_heads *
+                                  self.base_layer.head_size)
+        self.q_proj_shard_size = (self.base_layer.num_heads *
+                                  self.base_layer.head_size)
+        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
+                                   self.base_layer.head_size)
+        self.kv_proj_total_size = (self.base_layer.total_num_kv_heads *
+                                   self.base_layer.head_size)
+        # There is only one LoRA layer
+        self.n_slices = 1
+
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        self.q_shard_id = tp_rank
+        self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas
+        lora_b_q = lora_b[:, self.q_proj_shard_size *
+                          self.q_shard_id:self.q_proj_shard_size *
+                          (self.q_shard_id + 1)]
+        k_offset = self.q_proj_total_size
+        lora_b_k = lora_b[:, k_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:k_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        v_offset = k_offset + self.kv_proj_total_size
+        lora_b_v = lora_b[:, v_offset +
+                          self.kv_proj_shard_size * self.kv_shard_id:v_offset +
+                          self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1)
+        return lora_b
+
+    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
+        bias_q = bias[self.q_proj_shard_size *
+                      self.q_shard_id:self.q_proj_shard_size *
+                      (self.q_shard_id + 1)]
+        k_offset = self.q_proj_total_size
+        bias_k = bias[k_offset +
+                      self.kv_proj_shard_size * self.kv_shard_id:k_offset +
+                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        v_offset = k_offset + self.kv_proj_total_size
+        bias_v = bias[v_offset +
+                      self.kv_proj_shard_size * self.kv_shard_id:v_offset +
+                      self.kv_proj_shard_size * (self.kv_shard_id + 1)]
+        bias = torch.cat([bias_q, bias_k, bias_v], dim=1)
+        return bias
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(cls, source_layer: nn.Module,
+                          lora_config: LoRAConfig, packed_modules_list: list,
+                          model_config: Optional[PretrainedConfig]) -> bool:
+        return type(source_layer) is QKVParallelLinear and len(
+            packed_modules_list) == 1
+
+
+class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
+    """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices)
+    packed together in qkv proj fashion
+    (q_proj + k_proj + v_proj -> qkv_proj).
+
+    This means we have 3 LoRAs, each applied to one slice of the layer.
+
+    Q slice may have different shape than K and V slices (which both have
+    the same shape).
+    """
+
+    def __init__(self, base_layer: QKVParallelLinear) -> None:
+        super().__init__(base_layer)
+        # There are three LoRA layer.
+        self.n_slices = len(self.base_layer.output_sizes)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.q_proj_shard_size = (self.base_layer.num_heads *
+                                  self.base_layer.head_size)
+        self.kv_proj_shard_size = (self.base_layer.num_kv_heads *
+                                   self.base_layer.head_size)
+        self.q_shard_id = self.tp_rank
+        self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas
+
+        self.output_slices = (
+            self.q_proj_shard_size,
+            self.kv_proj_shard_size,
+            self.kv_proj_shard_size,
+        )
+        self.output_ids = (
+            self.q_shard_id,
+            self.kv_shard_id,
+            self.kv_shard_id,
+        )
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        """
+        The main reason for overloading this function is to handle inconsistent 
+        weight dimensions in qkv lora.
+        """
+        super().create_lora_weights(max_loras, lora_config, model_config)
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return (type(source_layer) is QKVParallelLinear
+                and len(packed_modules_list) == 3)
+
+
+# These following layers are based on the tensor parallelism strategy given in
+# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
+# https://arxiv.org/abs/2311.03285.
+
+
+class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
+    """
+    Differs from ColumnParallelLinearWithLoRA by slicing LoRA A also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`,
+    # their `lora_a` and `lora_b` have different sharding patterns. After
+    # completing the `lora_a` GEMM , a gather operation is performed.
+    # Therefore, the sharding of `lora_a` only needs to correspond with the
+    # gather operation.
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_size = self.lora_a_stacked[0].shape[2]
+        start_idx = tp_rank * shard_size
+        lora_a = lora_a[:, start_idx:start_idx + shard_size]
+        return lora_a
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
+
+
+class MergedColumnParallelLinearWithShardedLoRA(
+        MergedColumnParallelLinearWithLoRA):
+    """
+    Differs from MergedColumnParallelLinearWithLoRA by slicing the
+    LoRA A's also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    def slice_lora_a(
+        self, lora_a: list[Union[torch.Tensor, None]]
+    ) -> list[Union[torch.Tensor, None]]:
+        #NOTE: lora_a contains 2 subloras, and each sublora could be None.
+        output_shard_size = self.lora_a_stacked[0].shape[2]
+        output_start_idx = self.tp_rank * output_shard_size
+        lora_a = [
+            lora_a[0][:, output_start_idx:output_start_idx +
+                      output_shard_size] if lora_a[0] is not None else None,
+            lora_a[1][:, output_start_idx:output_start_idx +
+                      output_shard_size] if lora_a[1] is not None else None,
+        ]
+        return lora_a
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
+
+
+class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
+    """
+    Differs from QKVParallelLinearWithLoRA by slicing the
+    LoRA A's also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_size = self.lora_a_stacked[0].shape[2]
+        start_idx = tp_rank * shard_size
+        lora_a = lora_a[:, start_idx:start_idx + shard_size]
+        return lora_a
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(cls, source_layer: nn.Module,
+                          lora_config: LoRAConfig, packed_modules_list: list,
+                          model_config: Optional[PretrainedConfig]) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
+
+
+class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
+    """
+    Differs from MergedQKVParallelLinearWithLoRA by slicing the 
+    LoRA A's also.
+
+    Based on S-LoRA, slicing happens along the rank dim.
+    """
+
+    def slice_lora_a(
+        self, lora_a: list[Union[torch.Tensor, None]]
+    ) -> list[Union[torch.Tensor, None]]:
+        # NOTE: lora_a contains 3 subloras, and each sublora could be None.
+        shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)]
+        start_idx = [self.tp_rank * shard_size[i] for i in range(3)]
+        lora_a = [
+            lora_a[0][:, start_idx[0]:start_idx[0] +
+                      shard_size[0]] if lora_a[0] is not None else None,
+            lora_a[1][:, start_idx[1]:start_idx[1] +
+                      shard_size[1]] if lora_a[1] is not None else None,
+            lora_a[2][:, start_idx[2]:start_idx[2] +
+                      shard_size[2]] if lora_a[2] is not None else None,
+        ]
+        return lora_a
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _mcp_apply(x, bias, self)
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a50dcfa748f2fbce45e4d9d0417553af45ff7a89
--- /dev/null
+++ b/vllm/lora/layers/logits_processor.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.platforms import current_platform
+
+from .base import BaseLayerWithLoRA
+
+
+class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
+    """
+    LoRA wrapper for LogitsProcessor, with extra logic to handle the
+    application of the LoRA adapter and added LoRA vocabulary.
+
+    Args:
+        base_layer: LogitsProcessor layer
+        hidden_size: hidden size of the model
+        dtype: data type of the model
+        device: device of the model
+        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
+            received from base_layer.get_sharded_to_full_mapping(). If None,
+            no reindexing will be done.
+    """
+
+    def __init__(self, base_layer: LogitsProcessor, hidden_size: int,
+                 dtype: torch.dtype, device: torch.device,
+                 sharded_to_full_mapping: Optional[list[int]]) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.hidden_size = hidden_size
+        self.dtype = dtype
+        self.device = device
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.sharded_to_full_mapping = sharded_to_full_mapping
+
+    @property
+    def logits_as_input(self):
+        return self.base_layer.logits_as_input
+
+    @property
+    def vocab_size(self):
+        return self.base_layer.vocab_size
+
+    @property
+    def scale(self):
+        return self.base_layer.scale
+
+    @property
+    def soft_cap(self):
+        return self.base_layer.soft_cap
+
+    @property
+    def use_all_gather(self):
+        return self.base_layer.use_all_gather
+
+    @property
+    def org_vocab_size(self):
+        return self.base_layer.org_vocab_size
+
+    @property
+    def include_gpu_probs_tensor(self):
+        return self.base_layer.include_gpu_probs_tensor
+
+    @property
+    def should_modify_greedy_probs_inplace(self):
+        return self.base_layer.should_modify_greedy_probs_inplace
+
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: Optional[PretrainedConfig] = None,
+    ) -> None:
+        # TODO: Verify if this condition can be further relaxed
+        if 32000 < self.base_layer.vocab_size > 257024:
+            raise ValueError("When using LoRA, vocab size must be "
+                             "32000 >= vocab_size <= 257024")
+        self.lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                lora_config.max_lora_rank,
+                self.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                # Pad for kernel compatibility
+                math.ceil(self.base_layer.vocab_size /
+                          lora_config.lora_vocab_padding_size) *
+                lora_config.lora_vocab_padding_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.embeddings_tensors = torch.full(
+            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
+            fill_value=float("-inf"),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        if self.sharded_to_full_mapping is not None:
+            self.sharded_to_full_mapping_gpu = torch.tensor(
+                self.sharded_to_full_mapping,
+                device=self.device,
+                dtype=torch.long)
+        else:
+            self.sharded_to_full_mapping_gpu = None
+
+    def reset_lora(self, index: int):
+        self.lora_a_stacked[index] = 0
+        self.lora_b_stacked[index] = 0
+        self.embeddings_tensors[index] = float("-inf")
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        self.reset_lora(index)
+        self.lora_a_stacked[index,
+                            0, :lora_a.shape[1], :lora_a.shape[0]].copy_(
+                                lora_a.T, non_blocking=True)
+        self.lora_b_stacked[index,
+                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                lora_b.T, non_blocking=True)
+        if embeddings_tensor is not None:
+            self.embeddings_tensors[
+                index,
+                :embeddings_tensor.shape[0],
+                :embeddings_tensor.shape[1],
+            ] = embeddings_tensor
+
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> Optional[torch.Tensor]:
+        # Get the logits for the next tokens.
+        logits = lm_head.quant_method.apply(lm_head, hidden_states)
+        if embedding_bias is not None:
+            logits += embedding_bias
+
+        # Gather logits for TP
+        logits = self.base_layer._gather_logits(logits)
+
+        if logits is None:
+            return None
+
+        if self.sharded_to_full_mapping_gpu is not None:
+            # Reindex full logits tensor to ensure 1:1 mapping between
+            # index and token_id
+            # Example for:
+            #   org_vocab_size = 4
+            #   added_vocab_size = 2
+            #   pad_to_size = 8
+            #   tp_size = 2
+
+            # indices:  [0, 1, 2,  3, 4, 5, 6,  7]
+            # token_id: [0, 1, 4, -1, 2, 3, 5, -1]
+
+            # Therefore, the mapping is expected to be:
+            # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
+            # we get:
+            # indices:  [0, 1, 2, 3, 4, 5,  6,  7]
+            # token_id: [0, 1, 2, 3, 4, 5, -1, -1]
+            logits = logits[:, self.sharded_to_full_mapping_gpu]
+
+        lora_logits = torch.empty(
+            self.embeddings_tensors.shape[0] + 1,
+            self.embeddings_tensors.shape[1],
+            hidden_states.shape[0],
+            dtype=self.embeddings_tensors.dtype,
+            device=self.embeddings_tensors.device,
+        )
+        torch.matmul(self.embeddings_tensors,
+                     hidden_states.T,
+                     out=lora_logits[:-1])
+
+        neg_inf, pos_inf = current_platform.get_infinity_values(
+            lora_logits.dtype)
+
+        lora_logits[-1] = neg_inf
+        lora_logits = lora_logits.mT
+        indices_padded = self.punica_wrapper.sampler_indices_padded
+
+        if current_platform.is_tpu() or current_platform.is_xpu():
+            indices_padded = indices_padded[:logits.size(0)]
+
+        lora_logits = (lora_logits.reshape(
+            lora_logits.shape[0] * lora_logits.shape[1],
+            lora_logits.shape[2],
+        ).index_select(0, indices_padded).nan_to_num_(nan=neg_inf,
+                                                      posinf=pos_inf,
+                                                      neginf=neg_inf))
+
+        logits[:,
+               self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
+               lora_logits.shape[1]] = lora_logits
+
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_logits(
+                logits, hidden_states, self.lora_a_stacked,
+                self.lora_b_stacked, 1.0)
+
+        if not current_platform.can_update_inplace():
+            logits = lora_output
+
+        # Remove paddings in vocab (if any).
+        logits = logits[:, :self.base_layer.vocab_size]
+        return logits
+
+    def forward(self, *args, **kwargs):
+        return type(self.base_layer).forward(self, *args, **kwargs)
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # Special handling for the LogitsProcessor.
+        return False
diff --git a/vllm/lora/layers/qkv_x_parallel_linear.py b/vllm/lora/layers/qkv_x_parallel_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..367482d0ee0785f82c34a60bd3805b68b343c4ef
--- /dev/null
+++ b/vllm/lora/layers/qkv_x_parallel_linear.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .base import BaseLayerWithLoRA
+
+
+#TODO: Implement this
+class QKVCrossParallelLinearWithLoRA(BaseLayerWithLoRA):
+    pass
diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..3356297c1537a57c824262ca77ca18a5c6070901
--- /dev/null
+++ b/vllm/lora/layers/replicated_linear.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.model_executor.layers.linear import ReplicatedLinear
+
+from .base_linear import BaseLinearLayerWithLoRA
+
+
+class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
+
+    def __init__(self, base_layer: ReplicatedLinear) -> None:
+        super().__init__(base_layer, )
+        # To ensure interface compatibility, set to 1 always.
+        self.tp_size = 1
+        self.output_size = self.base_layer.output_size
+        self.n_slices = 1
+
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        """Forward of ReplicatedLinearWithLoRA
+
+        Args:
+            input_: Tensor whose last dimension is `input_size`.
+
+        Returns:
+            - output
+            - bias
+        """
+        bias = (self.base_layer.bias
+                if not self.base_layer.skip_bias_add else None)
+
+        # Matrix multiply.
+        output = self.apply(input_, bias)
+
+        output_bias = (self.base_layer.bias
+                       if self.base_layer.skip_bias_add else None)
+
+        if not self.base_layer.return_bias:
+            return output
+
+        return output, output_bias
+
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is ReplicatedLinear
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..18ef6fd1ddd7804cac4c5d4557c98f628c36b66d
--- /dev/null
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional, Union, cast
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.distributed import (get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              split_tensor_along_last_dim,
+                              tensor_model_parallel_all_reduce)
+# yapf: disable
+from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.platforms import current_platform
+
+from .base_linear import BaseLinearLayerWithLoRA
+from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
+
+
+class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
+
+    def __init__(self, base_layer: RowParallelLinear) -> None:
+        super().__init__(base_layer)
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # reset input_size
+        self.input_size = self.base_layer.input_size_per_partition
+        self.output_size = self.base_layer.output_size
+
+        self.tp_rank = get_tensor_model_parallel_rank()
+        # There is only one LoRA layer.
+        self.n_slices = 1
+
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+
+        shard_size = self.input_size
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        lora_a = lora_a[start_idx:end_idx, :]
+        return lora_a
+
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        return lora_b
+
+    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
+        return bias
+
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        """Forward of RowParallelLinear
+
+        Args:
+            input_: tensor whose last dimension is `input_size`. If
+                    `input_is_parallel` is set, then the last dimension
+                    is `input_size // tp_size`.
+
+        Returns:
+            - output
+            - bias
+        """
+        # set up backprop all-reduce.
+        if self.base_layer.input_is_parallel:
+            input_parallel = input_
+        else:
+            # TODO: simplify code below
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.base_layer.tp_size)
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        # Matrix multiply.
+        output_parallel = self.apply(input_parallel)
+        if self.base_layer.reduce_results and self.base_layer.tp_size > 1:
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+
+        if not self.base_layer.skip_bias_add:
+            output = (output_ + self.base_layer.bias
+                      if self.base_layer.bias is not None else output_)
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.base_layer.bias
+
+        if not self.base_layer.return_bias:
+            return output
+
+        return output, output_bias
+
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is RowParallelLinear
+
+
+
+# The following layer is based on the tensor parallelism strategy given in
+# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
+# https://arxiv.org/abs/2311.03285.
+
+class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
+    """
+    Differs from RowParallelLinearWithLoRA by slicing the
+    LoRA B's also.
+
+    Based on S-LoRA, slicing happens along the output dim.
+    This yields a combined partial sum from the row parallel base
+    layer and column partitioned output from the LoRA.
+    """
+
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        shard_size = self.lora_b_stacked[0].shape[2]
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        lora_b = lora_b[:, start_idx:end_idx]
+        return lora_b
+
+    def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
+        if bias is None:
+            return bias
+        self.lora_bias_stacked = cast(tuple[torch.Tensor, ...],
+                                      self.lora_bias_stacked)
+        shard_size = self.lora_bias_stacked[0].shape[2]
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        bias = bias[start_idx:end_idx]
+        return bias
+
+    def apply(self,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        output = self.base_layer.quant_method.apply(self.base_layer, x)
+
+        x = x.view(-1, x.shape[-1])
+        output, out_orig_shape = output.view(-1,
+                                             output.shape[-1]), output.shape
+        buffer = torch.zeros(
+            (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
+            dtype=torch.float32,
+            device=x.device,
+        )
+
+        shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
+            buffer, x, self.lora_a_stacked, 1.0)
+        if not current_platform.can_update_inplace():
+            buffer = shrunk_buffer
+
+        buffer = tensor_model_parallel_all_reduce(buffer)
+
+        # following S-LoRA, allows the fusing of all_gather and all_reduce
+        # by adding the column partitioned lora output to a slice of output
+        # tensor, which is a partial sum due to row parallel. All that
+        # remains is a standard all_reduce. User should be aware though that
+        # the output is not the same as a normal row_parallel, it should be
+        # reduced before being used
+        # NOTE offset are based on the rank.
+        shard_size = self.lora_b_stacked[0].shape[2]
+        offset_start = self.tp_rank * shard_size
+        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand(
+            output,
+            buffer,
+            self.lora_b_stacked,
+            self.lora_bias_stacked,
+            self.output_slices,
+            offset_start=offset_start,
+            add_input=True,
+        )
+
+        if not current_platform.can_update_inplace():
+            output = lora_output
+
+        output = output.view(*out_orig_shape)
+        return output
+
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )
diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..27dcd720fbdeabad3380b603268bfd6ad4cc0552
--- /dev/null
+++ b/vllm/lora/layers/utils.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+from vllm.adapter_commons.layers import AdapterMapping
+
+
+@dataclass
+class LoRAMapping(AdapterMapping):
+    is_prefill: bool = False
+
+
+def _get_lora_device(base_layer: nn.Module) -> torch.device:
+    # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
+    """Returns the device for where to place the LoRA tensors."""
+    # unquantizedLinear
+    if hasattr(base_layer, "weight"):
+        return base_layer.weight.device
+    # Compressed Tensor
+    elif hasattr(base_layer, "weight_packed"):
+        return base_layer.weight_packed.device
+    # GPTQ/AWQ
+    elif hasattr(base_layer, "qweight"):
+        return base_layer.qweight.device
+    # HQQ marlin
+    elif hasattr(base_layer, "W_q"):
+        return base_layer.W_q.device
+    else:
+        raise ValueError(f"Unsupported base layer: {base_layer}")
+
+
+def _not_fully_sharded_can_replace(can_replace):
+    """
+    decorator which adds the condition of not using fully sharded loras
+    intended to wrap can_replace_layer()
+    """
+
+    def dec(*args, **kwargs):
+        decorate = kwargs.pop("decorate") if "decorate" in kwargs else True
+        condition = (not kwargs["lora_config"].fully_sharded_loras
+                     if decorate else True)
+        return can_replace(*args, **kwargs) and condition
+
+    return dec
+
+
+def _fully_sharded_can_replace(can_replace):
+    """
+    decorator which adds the condition of fully sharded loras
+    intended to wrap can_replace_layer()
+    """
+
+    def dec(*args, **kwargs):
+        return (can_replace(*args, **kwargs)
+                and kwargs["lora_config"].fully_sharded_loras)
+
+    return dec
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d6218d970977982c96f2d8a830905f356f33d96
--- /dev/null
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from vllm.config.lora import LoRAConfig
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.platforms import current_platform
+
+from .base import BaseLayerWithLoRA
+
+
+class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
+
+    def __init__(self, base_layer: VocabParallelEmbedding) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.embeddings_slice: Optional[tuple[int, int]]
+        self.embeddings_weights: Optional[torch.Tensor]
+
+    def create_lora_weights(
+            self,
+            max_loras: int,
+            lora_config: LoRAConfig,
+            model_config: Optional[PretrainedConfig] = None) -> None:
+
+        if self.base_layer.num_added_embeddings_per_partition > 0:
+            # We can start adding lora weights
+            self.embeddings_weights = self.base_layer.weight.data[
+                self.base_layer.num_org_embeddings_per_partition:self.
+                base_layer.num_org_embeddings_per_partition +
+                self.base_layer.num_added_embeddings_per_partition]
+            self.embeddings_slice = (
+                self.base_layer.shard_indices.added_vocab_start_index -
+                self.base_layer.org_vocab_size,
+                self.base_layer.shard_indices.added_vocab_end_index -
+                self.base_layer.org_vocab_size)
+            self.base_layer.weight.data[
+                self.base_layer.num_org_embeddings_per_partition:].fill_(0)
+        else:
+            self.embeddings_slice = None
+            self.embeddings_weights = None
+
+        self.embeddings_tensors = torch.zeros(
+            (
+                max_loras,
+                lora_config.lora_extra_vocab_size,
+                self.base_layer.embedding_dim,
+            ),
+            dtype=self.base_layer.weight.dtype,
+            device=self.base_layer.weight.device,
+        )
+        self.lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                self.base_layer.org_vocab_size +
+                lora_config.lora_extra_vocab_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.base_layer.weight.device,
+        )
+        self.lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                self.base_layer.embedding_dim,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.base_layer.weight.device,
+        )
+        self.lora_a_stacked_2d = self.lora_a_stacked.view(
+            self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1],
+            self.lora_a_stacked.shape[2],
+        )
+
+    def reset_lora(self, index: int):
+        self.lora_a_stacked[index] = 0
+        self.lora_b_stacked[index] = 0
+        self.embeddings_tensors[index] = 0
+
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor] = None,
+    ):
+        self.reset_lora(index)
+        self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_(
+            lora_a, non_blocking=True)
+        self.lora_b_stacked[index,
+                            0, :lora_b.shape[1], :lora_b.shape[0]].copy_(
+                                lora_b.T, non_blocking=True)
+        if embeddings_tensor is not None:
+            self.embeddings_tensors[
+                index,
+                :embeddings_tensor.shape[0],
+                :embeddings_tensor.shape[1],
+            ].copy_(embeddings_tensor, non_blocking=True)
+            if self.embeddings_slice is not None:
+                # TODO(yard1): Optimize this copy, we don't need to copy
+                # everything, just the modified part
+                embeddings = self.embeddings_tensors.view(
+                    self.embeddings_tensors.shape[0] *
+                    self.embeddings_tensors.shape[1],
+                    self.embeddings_tensors.shape[2],
+                )[self.embeddings_slice[0]:self.embeddings_slice[1]]
+                assert self.embeddings_weights is not None
+                self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1,
+                                        1, 0)
+
+        # NB: Don't use torch.narrow here. torch.narrow triggers some
+        # Dynamic Shape specialization in torch.compile
+        num_tokens = x.shape[0]
+        indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens]
+        indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens]
+
+        full_lora_a_embeddings = F.embedding(
+            x + indices_1,
+            self.lora_a_stacked_2d,
+        )
+        full_output = self.base_layer.forward(x +
+                                              (indices_0 * added_tokens_mask))
+
+        full_output_org = full_output
+        if full_output.ndim == 3:
+            full_output = full_output.view(
+                full_output.shape[0] * full_output.shape[1], -1)
+        if full_lora_a_embeddings.ndim == 3:
+            full_lora_a_embeddings = full_lora_a_embeddings.view(
+                full_lora_a_embeddings.shape[0] *
+                full_lora_a_embeddings.shape[1],
+                -1,
+            )
+
+        lora_output: Optional[
+            torch.Tensor] = self.punica_wrapper.add_lora_embedding(
+                full_output,
+                full_lora_a_embeddings,
+                self.lora_b_stacked,
+                add_input=True)
+
+        if not current_platform.can_update_inplace():
+            full_output = lora_output
+
+        return full_output.view_as(full_output_org)
+
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: Optional[PretrainedConfig],
+    ) -> bool:
+        return type(source_layer) is VocabParallelEmbedding
+
+    @property
+    def weight(self):
+        return self.base_layer.weight
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 3072047a2606c53d1b43ec75d38756bb6943b404..77124380549140571b66b8d77b0aa55ef6f7afbd 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
 from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
                                         get_adapter, list_adapters,
                                         remove_adapter, set_adapter_mapping)
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 8b8e5cb7d5fae3ec7fef87cae1b439cc74414388..dc7249c38602154437ba781ee057c2e8712bec1f 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -9,7 +9,7 @@ import os
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py
index 572e39e0eced091f5a139200e7d424cafb9938fb..163bb412235ce28616347722546648a5f35d4e02 100644
--- a/vllm/lora/punica_wrapper/punica_xpu.py
+++ b/vllm/lora/punica_wrapper/punica_xpu.py
@@ -225,6 +225,13 @@ class PunicaWrapperXPU(PunicaWrapperBase):
             add_inputs=True,
             **kwargs)
 
+    @property
+    def sampler_indices_padded(self) -> torch.Tensor:
+        """
+        This property provides access to padded sampler indices.
+        """
+        return self._sampler_indices_padded[:]
+
     def add_lora_logits(self,
                         y: torch.Tensor,
                         x: torch.Tensor,
@@ -259,11 +266,11 @@ class PunicaWrapperXPU(PunicaWrapperBase):
             buffer = torch.zeros((x.size(0), r),
                                  dtype=torch.float32,
                                  device=x.device)
-
-        bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale)
+        sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0))
+        bgmv_shrink(x, lora_a_stacked, buffer, sampler_indices, scale)
         bgmv_expand(buffer,
                     lora_b_stacked,
                     y,
-                    self.sampler_indices,
+                    sampler_indices,
                     add_inputs=True)
         return y.view_as(y_org)
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index ab0a9fbd255de0755746cb5b94ca6d75a962324b..10ba390bffd9e2a60e3506ee32af751544e13fd1 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -11,23 +11,23 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
-from vllm.lora.fully_sharded_layers import (
-    ColumnParallelLinearWithShardedLoRA,
-    MergedColumnParallelLinearWithShardedLoRA,
-    MergedQKVParallelLinearWithShardedLoRA, QKVParallelLinearWithShardedLoRA,
-    RowParallelLinearWithShardedLoRA)
 # being imported for _all_lora_classes below
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              ColumnParallelLinearWithShardedLoRA,
                               LogitsProcessorWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
+                              MergedColumnParallelLinearWithShardedLoRA,
                               MergedQKVParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithShardedLoRA,
                               QKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithShardedLoRA,
                               ReplicatedLinearWithLoRA,
                               RowParallelLinearWithLoRA,
+                              RowParallelLinearWithShardedLoRA,
                               VocabParallelEmbeddingWithLoRA)
 from vllm.model_executor.layers.linear import LinearBase
 
@@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
     except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
             HFValidationError):
         # Handle errors that may occur during the download
-        # Return original path instead instead of throwing error here
+        # Return original path instead of throwing error here
         logger.exception("Error downloading the HuggingFace model")
         return lora_path
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 248d2954f1ef4eb94a0a361412b26a6bbf4dc8c7..3a807b1e161d2b3807ed534c28e6310abe35c192 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker,
                                         list_adapters_worker,
                                         set_active_adapters_worker)
 from vllm.adapter_commons.worker_manager import AbstractWorkerManager
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 6b5a107396c9286ca584b83f9135758a7969f2dd..e7eb8247d5efd663914a852febe86f9dbdbe8c8f 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -73,11 +73,6 @@ class CustomOp(nn.Module):
         # NOTE(woosuk): This is a placeholder for future extensions.
         return self.forward_native(*args, **kwargs)
 
-    def forward_neuron(self, *args, **kwargs):
-        # By default, we assume that Neuron ops are compatible with the
-        # PyTorch-native implementation.
-        return self.forward_native(*args, **kwargs)
-
     def forward_oot(self, *args, **kwargs):
         # By default, we assume that OOT ops are compatible with the
         # PyTorch-native implementation.
@@ -105,8 +100,6 @@ class CustomOp(nn.Module):
             return self.forward_tpu
         elif current_platform.is_xpu():
             return self.forward_xpu
-        elif current_platform.is_neuron():
-            return self.forward_neuron
         elif current_platform.is_out_of_tree():
             return self.forward_oot
         else:
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index eb7e494e328612be32d7e6548ec3f2e6f58aea17..235df1a77c5cec11a85d58cd5fd60ed0a44502da 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -95,13 +95,6 @@ class SiluAndMul(CustomOp):
         self.op(out, x)
         return out
 
-    def forward_neuron(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        x_reshaped = x.view(-1, x.shape[-1])
-        s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d])
-        result = s * x_reshaped[:, d:]
-        return result.view(*x.shape[:-1], d)
-
 
 @CustomOp.register("mul_and_silu")
 class MulAndSilu(CustomOp):
@@ -362,7 +355,7 @@ class ReLUSquaredActivation(CustomOp):
         return torch.square(F.relu(x))
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        #TODO : implement cuda kenrels
+        #TODO : implement cuda kernels
         return self.forward_native(x)
 
 
@@ -461,7 +454,7 @@ class XIELU(CustomOp):
         )
         return result.view(original_shape)
 
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, input: torch.Tensor) -> torch.Tensor:
         if self._xielu_cuda_obj is not None and input.is_cuda:
             if not torch._dynamo.is_compiling():
                 return self._xielu_cuda_fn(input)
@@ -471,6 +464,9 @@ class XIELU(CustomOp):
                 )
         return self._xielu_python(input)
 
+    def forward_cuda(self, input: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(input)
+
 
 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
diff --git a/vllm/model_executor/layers/fla/__init__.py b/vllm/model_executor/layers/fla/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e89cf9f79439d0896eec49b16e244c0cafd0466
--- /dev/null
+++ b/vllm/model_executor/layers/fla/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
diff --git a/vllm/model_executor/layers/fla/ops/__init__.py b/vllm/model_executor/layers/fla/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19cc14ba69288e05e906f1ae61aa77e67fdf410
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/__init__.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+from .chunk import chunk_gated_delta_rule
+from .fused_recurrent import fused_recurrent_gated_delta_rule
+from .layernorm_guard import RMSNormGated
+
+__all__ = [
+    "RMSNormGated",
+    "chunk_gated_delta_rule",
+    "fused_recurrent_gated_delta_rule",
+]
diff --git a/vllm/model_executor/layers/fla/ops/chunk.py b/vllm/model_executor/layers/fla/ops/chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7d295aff2392911e67418f1330465bc70d83610
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/chunk.py
@@ -0,0 +1,225 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import warnings
+from typing import Optional
+
+import torch
+from einops import rearrange
+
+from .chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from .chunk_o import chunk_fwd_o
+from .chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd
+from .cumsum import chunk_local_cumsum
+from .l2norm import l2norm_fwd
+from .solve_tril import solve_tril
+from .utils import SUPPRESS_LEVEL, input_guard
+from .wy_fast import recompute_w_u_fwd
+
+
+def chunk_gated_delta_rule_fwd(q: torch.Tensor,
+                               k: torch.Tensor,
+                               v: torch.Tensor,
+                               g: torch.Tensor,
+                               beta: torch.Tensor,
+                               scale: float,
+                               initial_state: torch.Tensor,
+                               output_final_state: bool,
+                               cu_seqlens: Optional[torch.LongTensor] = None):
+    g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
+    # obtain WY representation. u is actually the new v.
+    A = chunk_scaled_dot_kkt_fwd(k=k,
+                                 beta=beta,
+                                 g_cumsum=g,
+                                 cu_seqlens=cu_seqlens,
+                                 output_dtype=torch.float32)
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u = recompute_w_u_fwd(
+        k=k,
+        v=v,
+        beta=beta,
+        A=A,
+        g_cumsum=g,
+        cu_seqlens=cu_seqlens,
+    )
+    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+        k=k,
+        w=w,
+        u=u,
+        g=g,
+        initial_state=initial_state,
+        output_final_state=output_final_state,
+        cu_seqlens=cu_seqlens,
+    )
+    o = chunk_fwd_o(
+        q=q,
+        k=k,
+        v=v_new,
+        h=h,
+        g=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    if SUPPRESS_LEVEL < 3:
+        return g, o, A, final_state, None, None, None
+    elif SUPPRESS_LEVEL >= 3:
+        return g, o, A, final_state, w, h, v_new
+
+
+class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    @torch.amp.custom_fwd(device_type='cuda')
+    def forward(ctx,
+                q: torch.Tensor,
+                k: torch.Tensor,
+                v: torch.Tensor,
+                g: torch.Tensor,
+                beta: torch.Tensor,
+                scale: float,
+                initial_state: torch.Tensor,
+                output_final_state: bool,
+                cu_seqlens: Optional[torch.LongTensor] = None,
+                use_qk_l2norm_in_kernel: bool = False):
+        if use_qk_l2norm_in_kernel:
+            q = l2norm_fwd(q)
+            k = l2norm_fwd(k)
+
+        g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            output_final_state=output_final_state,
+            cu_seqlens=cu_seqlens,
+        )
+        ctx.scale = scale
+        ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel
+        return o.to(q.dtype), final_state
+
+
+@torch.compiler.disable
+def chunk_gated_delta_rule(q: torch.Tensor,
+                           k: torch.Tensor,
+                           v: torch.Tensor,
+                           g: torch.Tensor,
+                           beta: torch.Tensor,
+                           scale: float = None,
+                           initial_state: torch.Tensor = None,
+                           output_final_state: bool = False,
+                           cu_seqlens: Optional[torch.LongTensor] = None,
+                           head_first: bool = False,
+                           use_qk_l2norm_in_kernel: bool = False):
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        g (torch.Tensor):
+            (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, H, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        head_first (Optional[bool]):
+            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+            Default: `False`.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, K, V = 4, 2048, 4, 512, 512
+        >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
+        >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
+        >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
+        >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
+        >>> o, ht = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    assert q.dtype == k.dtype == v.dtype
+    assert q.dtype != torch.float32, "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    assert len(
+        beta.shape
+    ) == 3, "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+
+    if head_first:
+        raise DeprecationWarning(
+            "head_first is deprecated and will be removed in a future version. "
+            "Please use head_first=False for now instead.",
+            stacklevel=2)
+        q, k, v, beta, g = map(
+            lambda x: rearrange(x, 'b h t ... -> b t h ...'),
+            (q, k, v, beta, g))
+    if not head_first and q.shape[1] < q.shape[2]:
+        warnings.warn(
+            f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
+            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+            "when head_first=False was specified. "
+            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
+            stacklevel=2)
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing.")
+        if initial_state is not None and initial_state.shape[0] != len(
+                cu_seqlens) - 1:
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1]**-0.5
+    o, final_state = ChunkGatedDeltaRuleFunction.apply(
+        q, k, v, g, beta, scale, initial_state, output_final_state, cu_seqlens,
+        use_qk_l2norm_in_kernel)
+    if head_first:
+        o = rearrange(o, 'b t h ... -> b h t ...')
+    return o, final_state
diff --git a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..34006f87f457bb7f4ca12279f4659e687b448380
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
@@ -0,0 +1,290 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices, prepare_chunk_offsets
+from .op import exp
+from .utils import is_nvidia_hopper, use_cuda_graph
+
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+@triton.heuristics({
+    'USE_G': lambda args: args['g'] is not None,
+    'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
+    'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
+    'SAVE_NEW_VALUE': lambda args: args['v_new'] is not None,
+    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
+})
+@triton.autotune(
+    configs=[
+        triton.Config({'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4] for num_stages in [2, 3, 4] for BV in [32, 64]
+    ],
+    key=['H', 'K', 'V', 'BT', 'USE_G'],
+    use_cuda_graph=use_cuda_graph,
+)
+@triton.jit(do_not_specialize=['T'])
+def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+    k,
+    v,
+    w,
+    v_new,
+    g,
+    h,
+    h0,
+    ht,
+    cu_seqlens,
+    chunk_offsets,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    STORE_FINAL_STATE: tl.constexpr,
+    SAVE_NEW_VALUE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_nh = tl.program_id(0), tl.program_id(1)
+    i_n, i_h = i_nh // H, i_nh % H
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        NT = tl.cdiv(T, BT)
+        boh = i_n * NT
+
+    # [BK, BV]
+    b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 64:
+        b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 128:
+        b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 192:
+        b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+    # calculate offset
+    h += (boh * H + i_h) * K * V
+    v += (bos * H + i_h) * V
+    k += (bos * Hg + i_h // (H // Hg)) * K
+    w += (bos * H + i_h) * K
+    if SAVE_NEW_VALUE:
+        v_new += (bos * H + i_h) * V
+    stride_v = H * V
+    stride_h = H * K * V
+    stride_k = Hg * K
+    stride_w = H * K
+    if USE_INITIAL_STATE:
+        h0 = h0 + i_nh * K * V
+    if STORE_FINAL_STATE:
+        ht = ht + i_nh * K * V
+
+    # load initial state
+    if USE_INITIAL_STATE:
+        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV),
+                                   (1, 0))
+        b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+        if K > 64:
+            p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV),
+                                       (64, BV), (1, 0))
+            b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+        if K > 128:
+            p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV),
+                                       (64, BV), (1, 0))
+            b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+        if K > 192:
+            p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV),
+                                       (64, BV), (1, 0))
+            b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+    # main recurrence
+    for i_t in range(NT):
+        p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
+                                 (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
+                                     (64, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h2,
+                     b_h2.to(p_h2.dtype.element_ty),
+                     boundary_check=(0, 1))
+        if K > 128:
+            p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
+                                     (128, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h3,
+                     b_h3.to(p_h3.dtype.element_ty),
+                     boundary_check=(0, 1))
+        if K > 192:
+            p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1),
+                                     (192, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h4,
+                     b_h4.to(p_h4.dtype.element_ty),
+                     boundary_check=(0, 1))
+
+        p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV),
+                                (BT, BV), (1, 0))
+        p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1),
+                                    (i_t * BT, i_v * BV), (BT, BV),
+                                    (1, 0)) if SAVE_NEW_VALUE else None
+        b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
+        p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 0),
+                                (BT, 64), (1, 0))
+        b_w = tl.load(p_w, boundary_check=(0, 1))
+        b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+        if K > 64:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 64),
+                                    (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+        if K > 128:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 128),
+                                    (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+        if K > 192:
+            p_w = tl.make_block_ptr(w, (T, K), (stride_w, 1), (i_t * BT, 192),
+                                    (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
+        b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1))
+
+        if SAVE_NEW_VALUE:
+            p_v_new = tl.make_block_ptr(v_new, (T, V), (stride_v, 1),
+                                        (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+            tl.store(p_v_new,
+                     b_v_new.to(p_v_new.dtype.element_ty),
+                     boundary_check=(0, 1))
+
+        if USE_G:
+            m_t = (i_t * BT + tl.arange(0, BT)) < T
+            last_idx = min((i_t + 1) * BT, T) - 1
+            b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+            p_g = tl.make_block_ptr(g + bos * H + i_h, (T, ), (H, ),
+                                    (i_t * BT, ), (BT, ), (0, ))
+            b_g = tl.load(p_g, boundary_check=(0, ))
+            b_v_new = b_v_new * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None]
+            b_g_last = exp(b_g_last)
+            b_h1 = b_h1 * b_g_last
+            if K > 64:
+                b_h2 = b_h2 * b_g_last
+            if K > 128:
+                b_h3 = b_h3 * b_g_last
+            if K > 192:
+                b_h4 = b_h4 * b_g_last
+        b_v_new = b_v_new.to(k.dtype.element_ty)
+        p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT),
+                                (64, BT), (0, 1))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_h1 += tl.dot(b_k, b_v_new)
+        if K > 64:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT),
+                                    (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h2 += tl.dot(b_k, b_v_new)
+        if K > 128:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT),
+                                    (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h3 += tl.dot(b_k, b_v_new)
+        if K > 192:
+            p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT),
+                                    (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h4 += tl.dot(b_k, b_v_new)
+
+    # epilogue
+    if STORE_FINAL_STATE:
+        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV),
+                                 (1, 0))
+        tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV),
+                                     (64, BV), (1, 0))
+            tl.store(p_ht,
+                     b_h2.to(p_ht.dtype.element_ty),
+                     boundary_check=(0, 1))
+        if K > 128:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV),
+                                     (64, BV), (1, 0))
+            tl.store(p_ht,
+                     b_h3.to(p_ht.dtype.element_ty),
+                     boundary_check=(0, 1))
+        if K > 192:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV),
+                                     (64, BV), (1, 0))
+            tl.store(p_ht,
+                     b_h4.to(p_ht.dtype.element_ty),
+                     boundary_check=(0, 1))
+
+
+def chunk_gated_delta_rule_fwd_h(
+    k: torch.Tensor,
+    w: torch.Tensor,
+    u: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
+    initial_state: Optional[torch.Tensor] = None,
+    output_final_state: bool = False,
+    chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
+    save_new_value: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, u.shape[-1]
+    H = u.shape[-2]
+    BT = chunk_size
+
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    # N: the actual number of sequences in the batch with either equal or variable lengths
+    if cu_seqlens is None:
+        N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+    else:
+        N, NT, chunk_offsets = len(cu_seqlens) - 1, len(
+            chunk_indices), prepare_chunk_offsets(cu_seqlens, BT)
+    assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+    h = k.new_empty(B, NT, H, K, V)
+    final_state = k.new_empty(
+        N, H, K, V, dtype=torch.float32) if output_final_state else None
+
+    v_new = torch.empty_like(u) if save_new_value else None
+
+    def grid(meta):
+        return (triton.cdiv(V, meta['BV']), N * H)
+
+    chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+        k=k,
+        v=u,
+        w=w,
+        v_new=v_new,
+        g=g,
+        h=h,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        chunk_offsets=chunk_offsets,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT)
+    return h, v_new, final_state
diff --git a/vllm/model_executor/layers/fla/ops/chunk_o.py b/vllm/model_executor/layers/fla/ops/chunk_o.py
new file mode 100644
index 0000000000000000000000000000000000000000..332751a1860a900ad4a3a679f30146cce1e4d0f0
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/chunk_o.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# ruff: noqa: E501
+
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices
+from .op import exp
+from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper
+
+BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
+
+
+@triton.heuristics({
+    'USE_G': lambda args: args['g'] is not None,
+    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None
+})
+@triton.autotune(
+    configs=[
+        triton.Config({
+            'BK': BK,
+            'BV': BV
+        },
+                      num_warps=num_warps,
+                      num_stages=num_stages) for BK in BKV_LIST
+        for BV in BKV_LIST for num_warps in NUM_WARPS
+        for num_stages in [2, 3, 4]
+    ],
+    key=['H', 'K', 'V', 'BT'],
+)
+@triton.jit(do_not_specialize=['T'])
+def chunk_fwd_kernel_o(
+    q,
+    k,
+    v,
+    h,
+    g,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+
+    if IS_VARLEN:
+        i_tg = i_t
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    # offset calculation
+    q += (bos * Hg + i_h // (H // Hg)) * K
+    k += (bos * Hg + i_h // (H // Hg)) * K
+    v += (bos * H + i_h) * V
+    o += (bos * H + i_h) * V
+    h += (i_tg * H + i_h).to(tl.int64) * K * V
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK),
+                                (BT, BK), (1, 0))
+        p_k = tl.make_block_ptr(k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT),
+                                (BK, BT), (0, 1))
+        p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV),
+                                (BK, BV), (1, 0))
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        # [BK, BT]
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+
+        # [BT, BK] @ [BK, BV] -> [BT, BV]
+        b_o += tl.dot(b_q, b_h)
+        # [BT, BK] @ [BK, BT] -> [BT, BT]
+        b_A += tl.dot(b_q, b_k)
+
+    if USE_G:
+        g += bos * H + i_h
+        p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, ))
+        b_g = tl.load(p_g, boundary_check=(0, ))
+        b_o = b_o * exp(b_g)[:, None]
+        b_A = b_A * exp(b_g[:, None] - b_g[None, :])
+
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T
+    m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t)
+    b_A = tl.where(m_A, b_A, 0)
+
+    p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
+                            (BT, BV), (1, 0))
+    p_o = tl.make_block_ptr(o, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
+                            (BT, BV), (1, 0))
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+
+    # to fix mma -> mma layout conversion
+    # already solved by triton v3.2 or higher
+    b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_fwd_o(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        h: torch.Tensor,
+        g: Optional[torch.Tensor] = None,  # cumsum of log decay
+        scale: Optional[float] = None,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        chunk_size: int = 64) -> torch.Tensor:
+    B, T, Hg, K, V = *q.shape, v.shape[-1]
+    H = v.shape[-2]
+    if FLA_GDN_FIX_BT:
+        BT = 64
+    else:
+        BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    if scale is None:
+        scale = k.shape[-1]**-0.5
+
+    o = torch.empty_like(v)
+
+    def grid(meta):
+        return (triton.cdiv(V, meta['BV']), NT, B * H)
+
+    chunk_fwd_kernel_o[grid](
+        q,
+        k,
+        v,
+        h,
+        g,
+        o,
+        cu_seqlens,
+        chunk_indices,
+        scale,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+    )
+    return o
diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1adc6978f245960e770b20308fd601e047eb63d
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
@@ -0,0 +1,140 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices
+from .op import exp
+
+
+@triton.heuristics({
+    'IS_VARLEN': lambda args: args['cu_seqlens'] is not None,
+    'USE_G': lambda args: args['g_cumsum'] is not None
+})
+@triton.autotune(
+    configs=[
+        triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64, 128] for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=['H', 'K', 'BT', 'IS_VARLEN'],
+)
+@triton.jit(do_not_specialize=['T'])
+def chunk_scaled_dot_kkt_fwd_kernel(
+    k,
+    beta,
+    g_cumsum,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    USE_G: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T
+
+    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ),
+                               (i_t * BT, ), (BT, ), (0, ))
+    b_beta = tl.load(p_beta, boundary_check=(0, ))
+
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K),
+                                (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK),
+                                (1, 0))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = b_k * b_beta[:, None]
+        b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k))
+
+    if USE_G:
+        p_g = tl.make_block_ptr(g_cumsum + bos * H + i_h, (T, ), (H, ),
+                                (i_t * BT, ), (BT, ), (0, ))
+        b_g = tl.load(p_g, boundary_check=(0, ))
+        b_g_diff = b_g[:, None] - b_g[None, :]
+        b_A = b_A * exp(b_g_diff)
+
+    m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t)
+    b_A = tl.where(m_A, b_A, 0)
+    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1),
+                            (i_t * BT, 0), (BT, BT), (1, 0))
+    tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_scaled_dot_kkt_fwd(
+        k: torch.Tensor,
+        beta: torch.Tensor,
+        g_cumsum: Optional[torch.Tensor] = None,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        chunk_size: int = 64,
+        output_dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, H, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`.
+        g_cumsum (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H]`.
+            Default: None
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+
+    B, T, Hg, K = k.shape
+
+    H = beta.shape[-1]
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
+    chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)](
+        k=k,
+        beta=beta,
+        g_cumsum=g_cumsum,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        BT=BT,
+    )
+    return A
diff --git a/vllm/model_executor/layers/fla/ops/cumsum.py b/vllm/model_executor/layers/fla/ops/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..370a45fe163582e6db2a5046360927902dcedd9f
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/cumsum.py
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import warnings
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices
+from .utils import check_shared_mem, input_guard
+
+BS_LIST = [32, 64] if check_shared_mem() else [16, 32]
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(configs=[
+    triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]
+],
+                 key=['B', 'H', 'BT', 'IS_VARLEN', 'REVERSE'])
+@triton.jit(do_not_specialize=['T'])
+def chunk_local_cumsum_scalar_kernel(
+    s,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    REVERSE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T, ), (1, ),
+                                (i_t * BT, ), (BT, ), (0, ))
+        p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T, ), (1, ),
+                                (i_t * BT, ), (BT, ), (0, ))
+    else:
+        p_s = tl.make_block_ptr(s + bos * H + i_h, (T, ), (H, ), (i_t * BT, ),
+                                (BT, ), (0, ))
+        p_o = tl.make_block_ptr(o + bos * H + i_h, (T, ), (H, ), (i_t * BT, ),
+                                (BT, ), (0, ))
+    # [BT]
+    b_s = tl.load(p_s, boundary_check=(0, )).to(tl.float32)
+    b_o = tl.cumsum(b_s, axis=0)
+    if REVERSE:
+        b_z = tl.sum(b_s, axis=0)
+        b_o = -b_o + b_z[None] + b_s
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, ))
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(configs=[
+    triton.Config({'BS': BS}, num_warps=num_warps) for BS in BS_LIST
+    for num_warps in [2, 4, 8]
+],
+                 key=['B', 'H', 'S', 'BT', 'IS_VARLEN', 'REVERSE'])
+@triton.jit(do_not_specialize=['T'])
+def chunk_local_cumsum_vector_kernel(
+    s,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    S: tl.constexpr,
+    BT: tl.constexpr,
+    BS: tl.constexpr,
+    REVERSE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, BT)
+    if REVERSE:
+        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1., 0.)
+    else:
+        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1., 0.)
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1),
+                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+        p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1),
+                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+    else:
+        p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1),
+                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+        p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1),
+                                (i_t * BT, i_s * BS), (BT, BS), (1, 0))
+    # [BT, BS]
+    b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)
+    b_o = tl.dot(m_s, b_s, allow_tf32=False)
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_local_cumsum_scalar(
+        g: torch.Tensor,
+        chunk_size: int,
+        reverse: bool = False,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        head_first: bool = False,
+        output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor:
+    if head_first:
+        B, H, T = g.shape
+    else:
+        B, T, H = g.shape
+    assert chunk_size == 2**(chunk_size.bit_length() -
+                             1), "chunk_size must be a power of 2"
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+    grid = (NT, B * H)
+    chunk_local_cumsum_scalar_kernel[grid](g_org,
+                                           g,
+                                           cu_seqlens,
+                                           chunk_indices,
+                                           T=T,
+                                           B=B,
+                                           H=H,
+                                           BT=BT,
+                                           HEAD_FIRST=head_first,
+                                           REVERSE=reverse)
+    return g
+
+
+def chunk_local_cumsum_vector(
+        g: torch.Tensor,
+        chunk_size: int,
+        reverse: bool = False,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        head_first: bool = False,
+        output_dtype: Optional[torch.dtype] = torch.float) -> torch.Tensor:
+    if head_first:
+        B, H, T, S = g.shape
+    else:
+        B, T, H, S = g.shape
+    BT = chunk_size
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, chunk_size) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    assert chunk_size == 2**(chunk_size.bit_length() -
+                             1), "chunk_size must be a power of 2"
+
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+
+    def grid(meta):
+        return (triton.cdiv(meta['S'], meta['BS']), NT, B * H)
+
+    # keep cumulative normalizer in fp32
+    # this kernel is equivalent to
+    # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
+    chunk_local_cumsum_vector_kernel[grid](g_org,
+                                           g,
+                                           cu_seqlens,
+                                           chunk_indices,
+                                           T=T,
+                                           B=B,
+                                           H=H,
+                                           S=S,
+                                           BT=BT,
+                                           HEAD_FIRST=head_first,
+                                           REVERSE=reverse)
+    return g
+
+
+@input_guard
+def chunk_local_cumsum(g: torch.Tensor,
+                       chunk_size: int,
+                       reverse: bool = False,
+                       cu_seqlens: Optional[torch.Tensor] = None,
+                       head_first: bool = False,
+                       output_dtype: Optional[torch.dtype] = torch.float,
+                       **kwargs) -> torch.Tensor:
+    if not head_first and g.shape[1] < g.shape[2]:
+        warnings.warn(
+            f"Input tensor shape suggests potential format mismatch: seq_len ({g.shape[1]}) < num_heads ({g.shape[2]}). "
+            "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+            "when head_first=False was specified. "
+            "Please verify your input tensor format matches the expected shape [B, T, H, ...].",
+            stacklevel=2)
+    if cu_seqlens is not None:
+        assert g.shape[
+            0] == 1, "Only batch size 1 is supported when cu_seqlens are provided"
+    if len(g.shape) == 3:
+        return chunk_local_cumsum_scalar(g, chunk_size, reverse, cu_seqlens,
+                                         head_first, output_dtype)
+    elif len(g.shape) == 4:
+        return chunk_local_cumsum_vector(g, chunk_size, reverse, cu_seqlens,
+                                         head_first, output_dtype)
+    else:
+        raise ValueError(f"Unsupported input shape {g.shape}. "
+                         f"which should be (B, T, H, D) if `head_first=False` "
+                         f"or (B, H, T, D) otherwise")
diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..b278e3741574810d914af1442e6bdeb6ee4c9611
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
@@ -0,0 +1,366 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .op import exp
+
+
+@triton.heuristics({
+    'USE_INITIAL_STATE':
+    lambda args: args['h0'] is not None,
+    'IS_VARLEN':
+    lambda args: args['cu_seqlens'] is not None,
+    "IS_CONTINUOUS_BATCHING":
+    lambda args: args['ssm_state_indices'] is not None,
+    "IS_SPEC_DECODING":
+    lambda args: args['num_accepted_tokens'] is not None,
+})
+@triton.jit(do_not_specialize=['N', 'T'])
+def fused_recurrent_gated_delta_rule_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0,
+    ht,
+    cu_seqlens,
+    ssm_state_indices,
+    num_accepted_tokens,
+    scale,
+    N: tl.constexpr,  # num of sequences
+    T: tl.constexpr,  # num of tokens
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    stride_init_state_token: tl.constexpr,
+    stride_final_state_token: tl.constexpr,
+    stride_indices_seq: tl.constexpr,
+    stride_indices_tok: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    INPLACE_FINAL_STATE: tl.constexpr,  # whether to store final state inplace
+    IS_BETA_HEADWISE: tl.
+    constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int64), tl.load(cu_seqlens + i_n + 1).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+
+    if T == 0:
+        # no tokens to process for this sequence
+        return
+
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+    p_g = g + bos * HV + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        if IS_CONTINUOUS_BATCHING:
+            if IS_SPEC_DECODING:
+                i_t = tl.load(num_accepted_tokens + i_n).to(tl.int64) - 1
+            else:
+                i_t = 0
+            p_h0 = h0 + tl.load(ssm_state_indices + i_n * stride_indices_seq +
+                                i_t).to(tl.int64) * stride_init_state_token
+        else:
+            p_h0 = h0 + bos * HV * K * V
+        p_h0 = p_h0 + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for i_t in range(0, T):
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_g = tl.load(p_g).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
+            b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
+        b_q = b_q * scale
+        # [BK, BV]
+        b_h *= exp(b_g)
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # keep the states for multi-query tokens
+        if INPLACE_FINAL_STATE:
+            p_ht = ht + tl.load(ssm_state_indices + i_n * stride_indices_seq +
+                                i_t).to(tl.int64) * stride_final_state_token
+        else:
+            p_ht = ht + (bos + i_t) * stride_final_state_token
+        p_ht = p_ht + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
+        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
+
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        p_g += HV
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+
+def fused_recurrent_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    inplace_final_state: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    ssm_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = q.new_empty(NK, *v.shape)
+    if inplace_final_state:
+        final_state = initial_state
+    else:
+        final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype)
+
+    stride_init_state_token = initial_state.stride(0)
+    stride_final_state_token = final_state.stride(0)
+
+    if ssm_state_indices is None:
+        stride_indices_seq, stride_indices_tok = 1, 1
+    elif ssm_state_indices.ndim == 1:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
+    else:
+        stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=num_accepted_tokens,
+        scale=scale,
+        N=N,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        stride_init_state_token=stride_init_state_token,
+        stride_final_state_token=stride_final_state_token,
+        stride_indices_seq=stride_indices_seq,
+        stride_indices_tok=stride_indices_tok,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        INPLACE_FINAL_STATE=inplace_final_state,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o, final_state
+
+
+class FusedRecurrentFunction(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx,
+                q: torch.Tensor,
+                k: torch.Tensor,
+                v: torch.Tensor,
+                g: torch.Tensor,
+                beta: torch.Tensor,
+                scale: float,
+                initial_state: torch.Tensor,
+                inplace_final_state: bool = True,
+                cu_seqlens: Optional[torch.LongTensor] = None,
+                ssm_state_indices: Optional[torch.Tensor] = None,
+                num_accepted_tokens: Optional[torch.Tensor] = None,
+                use_qk_l2norm_in_kernel: bool = False):
+        o, final_state = fused_recurrent_gated_delta_rule_fwd(
+            q=q.contiguous(),
+            k=k.contiguous(),
+            v=v.contiguous(),
+            g=g.contiguous(),
+            beta=beta.contiguous(),
+            scale=scale,
+            initial_state=initial_state,
+            inplace_final_state=inplace_final_state,
+            cu_seqlens=cu_seqlens,
+            ssm_state_indices=ssm_state_indices,
+            num_accepted_tokens=num_accepted_tokens,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+        )
+
+        return o, final_state
+
+
+def fused_recurrent_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    inplace_final_state: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    ssm_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, HV, V]`.
+            GVA is applied if `HV > H`.
+        g (torch.Tensor):
+            g (decays) of shape `[B, T, HV]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, HV]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        inplace_final_state: bool:
+            Whether to store the final state in-place to save memory.
+            Default: `True`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        ssm_state_indices (Optional[torch.Tensor]):
+            Indices to map the input sequences to the initial/final states.
+        num_accepted_tokens (Optional[torch.Tensor]):
+            Number of accepted tokens for each sequence during decoding.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HV, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, HV, K, V]`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
+        >>> q = torch.randn(B, T, H, K, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, HV, V, device='cuda')
+        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
+        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
+        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
+        >>> o, ht = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    if cu_seqlens is not None and q.shape[0] != 1:
+        raise ValueError(
+            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+            f"Please flatten variable-length inputs before processing.")
+    if scale is None:
+        scale = k.shape[-1]**-0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o, final_state = FusedRecurrentFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        inplace_final_state,
+        cu_seqlens,
+        ssm_state_indices,
+        num_accepted_tokens,
+        use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
diff --git a/vllm/model_executor/layers/fla/ops/index.py b/vllm/model_executor/layers/fla/ops/index.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eca32bc31a044b2ff9e36bc2a086b81e5ca5704
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/index.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import torch
+
+from vllm.triton_utils import triton
+
+from .utils import tensor_cache
+
+
+@tensor_cache
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return cu_seqlens[1:] - cu_seqlens[:-1]
+
+
+@tensor_cache
+def prepare_chunk_indices(cu_seqlens: torch.LongTensor,
+                          chunk_size: int) -> torch.LongTensor:
+    indices = torch.cat([
+        torch.arange(n)
+        for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
+    ])
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices],
+                       1).to(cu_seqlens)
+
+
+@tensor_cache
+def prepare_chunk_offsets(cu_seqlens: torch.LongTensor,
+                          chunk_size: int) -> torch.LongTensor:
+    return torch.cat([
+        cu_seqlens.new_tensor([0]),
+        triton.cdiv(prepare_lens(cu_seqlens), chunk_size)
+    ]).cumsum(-1)
diff --git a/vllm/model_executor/layers/fla/ops/l2norm.py b/vllm/model_executor/layers/fla/ops/l2norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef9788ceaf20e2968169a104c043f3242db29ad9
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/l2norm.py
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+BT_LIST = [8, 16, 32, 64, 128]
+
+USE_DEFAULT_FLA_NORM = int(os.getenv("USE_DEFAULT_FLA_NORM", "0"))
+
+
+@triton.autotune(configs=[
+    triton.Config({}, num_warps=num_warps)
+    for num_warps in [1, 2, 4, 8, 16, 32]
+],
+                 key=['D'])
+@triton.jit
+def l2norm_fwd_kernel1(
+    x,
+    y,
+    D,
+    BD: tl.constexpr,
+    eps,
+):
+    i_t = tl.program_id(0)
+    x += i_t * D
+    y += i_t * D
+    # Compute mean and variance
+    cols = tl.arange(0, BD)
+    mask = cols < D
+    b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=0)
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    # tl.store(Rstd + i_t, rstd)
+    # Normalize and apply linear transformation
+    b_y = b_x * b_rstd
+    tl.store(y + cols, b_y, mask=mask)
+
+
+@triton.autotune(configs=[
+    triton.Config({'BT': BT}, num_warps=num_warps)
+    for num_warps in [1, 2, 4, 8, 16] for BT in BT_LIST
+],
+                 key=['D'])
+@triton.jit(do_not_specialize=["NB"])
+def l2norm_fwd_kernel(
+    x,
+    y,
+    eps,
+    NB,
+    T,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=1)
+    b_y = b_x / tl.sqrt(b_var + eps)[:, None]
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr):
+    xoffset = tl.program_id(0) * MBLOCK
+    row_idx = xoffset + tl.arange(0, MBLOCK)[:, None]
+    xmask = row_idx < M
+    rindex = tl.arange(0, N)[None, :]
+    xs = tl.load(X + (rindex + N * row_idx), xmask).to(tl.float32)
+    square = tl.broadcast_to(xs * xs, [MBLOCK, N])
+    square_sum = tl.sum(tl.where(xmask, square, 0), 1)[:, None]
+    rsqrt = tl.rsqrt(square_sum + eps)
+    tl.store(Y + (rindex + N * row_idx), xs * rsqrt, xmask)
+
+
+def l2norm_fwd(x: torch.Tensor,
+               eps: float = 1e-6,
+               output_dtype: Optional[torch.dtype] = None):
+    x_shape_og = x.shape
+    x = x.view(-1, x.shape[-1])
+    # allocate output
+    if output_dtype is None:
+        y = torch.empty_like(x)
+    else:
+        y = torch.empty_like(x, dtype=output_dtype)
+    assert y.stride(-1) == 1
+    T, D = x.shape[0], x.shape[-1]
+    # rstd = torch.empty((T,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer doesn't support feature dim >= 64KB.")
+
+    if not USE_DEFAULT_FLA_NORM:
+        MBLOCK = 32
+        # M, N = x.shape
+        l2norm_fwd_kernel2[(triton.cdiv(T, MBLOCK), )](
+            x,
+            y,
+            eps,
+            T,
+            D,
+            MBLOCK,
+        )
+    else:
+        if D <= 512:
+            NB = triton.cdiv(T, 2048)
+
+            def grid(meta):
+                return (triton.cdiv(T, meta['BT']), )
+
+            l2norm_fwd_kernel[grid](
+                x,
+                y,
+                eps,
+                NB=NB,
+                T=T,
+                D=D,
+                BD=BD,
+            )
+        else:
+            l2norm_fwd_kernel1[(T, )](
+                x,
+                y,
+                eps=eps,
+                D=D,
+                BD=BD,
+            )
+
+    return y.view(x_shape_og)
diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
new file mode 100644
index 0000000000000000000000000000000000000000..a733c6c81e369a2b562edfa405a053c320391497
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
@@ -0,0 +1,337 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Tri Dao
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2024, Tri Dao.
+
+# ruff: noqa: E501
+# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
+# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
+# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from vllm.triton_utils import tl, triton
+
+from .utils import input_guard
+
+
+def rms_norm_ref(x,
+                 weight,
+                 bias,
+                 z=None,
+                 eps=1e-6,
+                 group_size=None,
+                 norm_before_gate=True,
+                 upcast=True):
+    dtype = x.dtype
+    weight = weight.float()
+    bias = bias.float() if bias is not None else None
+    if upcast:
+        x = x.float()
+        z = z.float() if z is not None else z
+    if z is not None and not norm_before_gate:
+        x = x * F.silu(z)
+    if group_size is None:
+        rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
+        out = (x * rstd * weight) + bias if bias is not None else (x * rstd *
+                                                                   weight)
+    else:
+        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
+        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) +
+                              eps)
+        out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
+        if bias is not None:
+            out = out + bias
+    if z is not None and norm_before_gate:
+        out *= F.silu(z)
+    return out.to(dtype)
+
+
+@triton.heuristics({
+    "HAS_BIAS": lambda args: args["B"] is not None,
+    "HAS_Z": lambda args: args["Z"] is not None,
+})
+@triton.jit
+def layer_norm_fwd_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row,  # how much to increase the pointer when moving by 1 row
+    stride_y_row,
+    stride_z_row,
+    M,  # number of rows in X
+    N,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    group = tl.program_id(1)
+    X += row * stride_x_row + group * N
+    Y += row * stride_y_row + group * N
+    if HAS_Z:
+        Z += row * stride_z_row + group * N
+    if not IS_RMS_NORM:
+        Mean += group * M
+    Rstd += group * M
+    W += group * N
+    if HAS_BIAS:
+        B += group * N
+    # Compute mean and variance
+    cols = tl.arange(0, BLOCK_N)
+    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
+    if HAS_Z and not NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        x *= z * tl.sigmoid(z)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=0) / N
+        tl.store(Mean + row, mean)
+        xbar = tl.where(cols < N, x - mean, 0.)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    else:
+        xbar = tl.where(cols < N, x, 0.)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    mask = cols < N
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    if HAS_BIAS:
+        b = tl.load(B + cols, mask=mask).to(tl.float32)
+    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
+    y = x_hat * w + b if HAS_BIAS else x_hat * w
+    if HAS_Z and NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        y *= z * tl.sigmoid(z)
+    # Write output
+    tl.store(Y + cols, y, mask=mask)
+
+
+def layer_norm_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    z: torch.Tensor = None,
+    out: torch.Tensor = None,
+    group_size: int = None,
+    norm_before_gate: bool = True,
+    is_rms_norm: bool = False,
+):
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N, )
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N, )
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    mean = torch.empty((ngroups * M, ), dtype=torch.float32,
+                       device=x.device) if not is_rms_norm else None
+    rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError(
+            "This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    grid = (M, ngroups)
+    layer_norm_fwd_kernel[grid](x,
+                                out,
+                                weight,
+                                bias,
+                                z,
+                                mean,
+                                rstd,
+                                x.stride(0),
+                                out.stride(0),
+                                z.stride(0) if z is not None else 0,
+                                M,
+                                group_size,
+                                eps,
+                                BLOCK_N=BLOCK_N,
+                                NORM_BEFORE_GATE=norm_before_gate,
+                                IS_RMS_NORM=is_rms_norm,
+                                num_warps=num_warps)
+    return out, mean, rstd
+
+
+class LayerNormFn(torch.autograd.Function):
+
+    @input_guard
+    @staticmethod
+    def forward(ctx,
+                x,
+                weight,
+                bias,
+                z=None,
+                eps=1e-6,
+                group_size=None,
+                norm_before_gate=True,
+                is_rms_norm=False):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
+        """
+
+        x_shape_og = x.shape
+        # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        if x.stride(-1) != 1:
+            x = x.contiguous()
+        if z is not None:
+            assert z.shape == x_shape_og
+            z = z.reshape(-1, z.shape[-1])
+            if z.stride(-1) != 1:
+                z = z.contiguous()
+        weight = weight.contiguous()
+        if bias is not None:
+            bias = bias.contiguous()
+        y, mean, rstd = layer_norm_fwd(
+            x,
+            weight,
+            bias,
+            eps,
+            z=z,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            is_rms_norm=is_rms_norm,
+        )
+        ctx.save_for_backward(x, weight, bias, mean, rstd, z)
+        ctx.x_shape_og = x_shape_og
+        ctx.eps = eps
+        ctx.group_size = group_size
+        ctx.norm_before_gate = norm_before_gate
+        ctx.is_rms_norm = is_rms_norm
+        return y.reshape(x_shape_og)
+
+
+def layernorm_fn(x,
+                 weight,
+                 bias,
+                 z=None,
+                 eps=1e-6,
+                 group_size=None,
+                 norm_before_gate=True,
+                 is_rms_norm=False):
+    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
+                             norm_before_gate, is_rms_norm)
+
+
+def rmsnorm_fn(x,
+               weight,
+               bias,
+               z=None,
+               eps=1e-6,
+               group_size=None,
+               norm_before_gate=True):
+    return LayerNormFn.apply(x, weight, bias, z, eps, group_size,
+                             norm_before_gate, True)
+
+
+class LayerNormGated(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps: float = 1e-5,
+        group_size: Optional[int] = None,
+        norm_before_gate: bool = True,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.bias = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+        torch.nn.init.zeros_(self.bias)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
+        """
+        return layernorm_fn(x,
+                            self.weight,
+                            self.bias,
+                            z=z,
+                            group_size=self.group_size,
+                            eps=self.eps,
+                            norm_before_gate=self.norm_before_gate)
+
+
+class RMSNormGated(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps: float = 1e-5,
+        group_size: Optional[int] = None,
+        norm_before_gate: bool = False,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.register_parameter("bias", None)
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
+        """
+        return rmsnorm_fn(x,
+                          self.weight,
+                          self.bias,
+                          z=z,
+                          eps=self.eps,
+                          group_size=self.group_size,
+                          norm_before_gate=self.norm_before_gate)
diff --git a/vllm/model_executor/layers/fla/ops/op.py b/vllm/model_executor/layers/fla/ops/op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c29434ca106ad7d8e66399eef533cb64c9bb50d
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/op.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+
+from vllm.triton_utils import tl, tldevice, triton
+
+if os.environ.get('FLA_USE_FAST_OPS', '0') == '1':
+    div = tldevice.fast_dividef
+    exp = tldevice.fast_expf
+    log = tldevice.fast_logf
+    log2 = tldevice.fast_log2f
+else:
+
+    @triton.jit
+    def div_normal(x, y):
+        return x / y
+
+    div = div_normal
+    exp = tl.exp
+    log = tl.log
+    log2 = tl.log2
+
+
+if not hasattr(tl, 'gather'):
+
+    @triton.jit
+    def gather(src, index, axis, _builder=None):
+        # This is a fallback implementation when tl.gather is not supported
+        # In order to pass triton compiler, there is no actual gather operation
+        return src
+else:
+    gather = tl.gather
diff --git a/vllm/model_executor/layers/fla/ops/solve_tril.py b/vllm/model_executor/layers/fla/ops/solve_tril.py
new file mode 100644
index 0000000000000000000000000000000000000000..97cb0d800d4110dd44bc343ea5b32caff638317c
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/solve_tril.py
@@ -0,0 +1,365 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices
+from .utils import input_guard
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5]
+    ],
+    key=['BT'],
+)
+@triton.jit(do_not_specialize=['T'])
+def solve_tril_16x16_kernel(
+    A,
+    Ad,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A = A + (bos * H + i_h) * BT
+    Ad = Ad + (bos * H + i_h) * 16
+
+    offset = (i_t * 16) % BT
+    p_A = tl.make_block_ptr(A, (T, BT), (H * BT, 1), (i_t * 16, offset),
+                            (16, 16), (1, 0))
+    p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16),
+                             (1, 0))
+    b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
+    b_A = -tl.where(
+        tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0)
+
+    o_i = tl.arange(0, 16)
+    for i in range(1, min(16, T - i_t * 16)):
+        b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
+        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
+        mask = o_i == i
+        b_A = tl.where(mask[:, None], b_a, b_A)
+    b_A += o_i[:, None] == o_i[None, :]
+    tl.store(p_Ai,
+             b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4, 5]
+    ],
+    key=['H', 'BT', 'IS_VARLEN'],
+)
+@triton.jit(do_not_specialize=['T'])
+def merge_16x16_to_32x32_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices,
+                                        T, H: tl.constexpr, BT: tl.constexpr,
+                                        IS_VARLEN: tl.constexpr):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 32
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 32
+
+    p_A_21 = tl.make_block_ptr(A, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0),
+                               (16, 16), (1, 0))
+    p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32, 0),
+                                (16, 16), (1, 0))
+    p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 32 + 16, 0),
+                                (16, 16), (1, 0))
+    p_Ai_11 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32, 0),
+                                (16, 16), (1, 0))
+    p_Ai_22 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 16),
+                                (16, 16), (1, 0))
+    p_Ai_21 = tl.make_block_ptr(Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0),
+                                (16, 16), (1, 0))
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'),
+                    Ai_11,
+                    input_precision='ieee')
+    tl.store(p_Ai_11,
+             Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_22,
+             Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_21,
+             Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8] for num_stages in [2, 3, 4, 5]
+    ],
+    key=['H', 'BT', 'IS_VARLEN'],
+)
+@triton.jit(do_not_specialize=['T'])
+def merge_16x16_to_64x64_inverse_kernel(A, Ad, Ai, cu_seqlens, chunk_indices,
+                                        T, H: tl.constexpr, BT: tl.constexpr,
+                                        IS_VARLEN: tl.constexpr):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 64
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 64
+
+    p_A_21 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0),
+                               (16, 16), (1, 0))
+    p_A_32 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16),
+                               (16, 16), (1, 0))
+    p_A_31 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0),
+                               (16, 16), (1, 0))
+    p_A_43 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32),
+                               (16, 16), (1, 0))
+    p_A_42 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16),
+                               (16, 16), (1, 0))
+    p_A_41 = tl.make_block_ptr(A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0),
+                               (16, 16), (1, 0))
+    p_Ad_11 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64, 0),
+                                (16, 16), (1, 0))
+    p_Ad_22 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 16, 0),
+                                (16, 16), (1, 0))
+    p_Ad_33 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 32, 0),
+                                (16, 16), (1, 0))
+    p_Ad_44 = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 64 + 48, 0),
+                                (16, 16), (1, 0))
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
+    A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
+    A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
+    A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
+    A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_33 = tl.load(p_Ad_33, boundary_check=(0, 1)).to(tl.float32)
+    Ai_44 = tl.load(p_Ad_44, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_21 = -tl.dot(tl.dot(Ai_22, A_21, input_precision='ieee'),
+                    Ai_11,
+                    input_precision='ieee')
+    Ai_32 = -tl.dot(tl.dot(Ai_33, A_32, input_precision='ieee'),
+                    Ai_22,
+                    input_precision='ieee')
+    Ai_43 = -tl.dot(tl.dot(Ai_44, A_43, input_precision='ieee'),
+                    Ai_33,
+                    input_precision='ieee')
+
+    Ai_31 = -tl.dot(Ai_33,
+                    tl.dot(A_31, Ai_11, input_precision='ieee') +
+                    tl.dot(A_32, Ai_21, input_precision='ieee'),
+                    input_precision='ieee')
+    Ai_42 = -tl.dot(Ai_44,
+                    tl.dot(A_42, Ai_22, input_precision='ieee') +
+                    tl.dot(A_43, Ai_32, input_precision='ieee'),
+                    input_precision='ieee')
+    Ai_41 = -tl.dot(Ai_44,
+                    tl.dot(A_41, Ai_11, input_precision='ieee') +
+                    tl.dot(A_42, Ai_21, input_precision='ieee') +
+                    tl.dot(A_43, Ai_31, input_precision='ieee'),
+                    input_precision='ieee')
+
+    p_Ai_11 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 0),
+                                (16, 16), (1, 0))
+    p_Ai_22 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 16),
+                                (16, 16), (1, 0))
+    p_Ai_33 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 32),
+                                (16, 16), (1, 0))
+    p_Ai_44 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 48),
+                                (16, 16), (1, 0))
+    p_Ai_21 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0),
+                                (16, 16), (1, 0))
+    p_Ai_31 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0),
+                                (16, 16), (1, 0))
+    p_Ai_32 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16),
+                                (16, 16), (1, 0))
+    p_Ai_41 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0),
+                                (16, 16), (1, 0))
+    p_Ai_42 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16),
+                                (16, 16), (1, 0))
+    p_Ai_43 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32),
+                                (16, 16), (1, 0))
+    tl.store(p_Ai_11,
+             Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_22,
+             Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_33,
+             Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_44,
+             Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_21,
+             Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_31,
+             Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_32,
+             Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_41,
+             Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_42,
+             Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_43,
+             Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+
+    fill_zeros = tl.zeros((16, 16), dtype=tl.float32)
+    p_Ai_12 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 16),
+                                (16, 16), (1, 0))
+    p_Ai_13 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 32),
+                                (16, 16), (1, 0))
+    p_Ai_14 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64, 48),
+                                (16, 16), (1, 0))
+    p_Ai_23 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 32),
+                                (16, 16), (1, 0))
+    p_Ai_24 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 48),
+                                (16, 16), (1, 0))
+    p_Ai_34 = tl.make_block_ptr(Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 48),
+                                (16, 16), (1, 0))
+    tl.store(p_Ai_12,
+             fill_zeros.to(p_Ai_12.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_13,
+             fill_zeros.to(p_Ai_13.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_14,
+             fill_zeros.to(p_Ai_14.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_23,
+             fill_zeros.to(p_Ai_23.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_24,
+             fill_zeros.to(p_Ai_24.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+    tl.store(p_Ai_34,
+             fill_zeros.to(p_Ai_34.dtype.element_ty,
+                           fp_downcast_rounding="rtne"),
+             boundary_check=(0, 1))
+
+
+@input_guard
+def solve_tril(A: torch.Tensor,
+               cu_seqlens: Optional[torch.Tensor] = None,
+               output_dtype: torch.dtype = torch.float) -> torch.Tensor:
+    """
+    Compute the inverse of the lower triangular matrix
+    A should be strictly lower triangular, i.e., A.triu() == 0.
+
+    Args:
+        A (torch.Tensor):
+            [B, T, H, K]
+        cu_seqlens (torch.Tensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float`
+
+    Returns:
+        (I + A)^-1 with the same shape as A
+    """
+    assert A.shape[-1] in [16, 32, 64]
+
+    B, T, H, BT = A.shape
+    Ad = torch.empty(B,
+                     T,
+                     H,
+                     16,
+                     device=A.device,
+                     dtype=torch.float if BT != 16 else output_dtype)
+
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, 16) if cu_seqlens is not None else None
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16)
+    solve_tril_16x16_kernel[NT, B * H](
+        A=A,
+        Ad=Ad,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+    )
+    if BT == 16:
+        return Ad
+
+    Ai = torch.empty(B, T, H, BT, device=A.device, dtype=output_dtype)
+    merge_fn = merge_16x16_to_32x32_inverse_kernel if BT == 32 else merge_16x16_to_64x64_inverse_kernel
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
+    merge_fn[NT, B * H](
+        A=A,
+        Ad=Ad,
+        Ai=Ai,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+    )
+    return Ai
diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fd90cee45d0e71b3c5a67456da6823efcbdb728
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/utils.py
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# ruff: noqa: E501
+import contextlib
+import functools
+import logging
+import os
+from enum import Enum
+from typing import Any, Callable, Literal, Optional
+
+import torch
+
+from vllm.triton_utils import triton
+
+logger = logging.getLogger(__name__)
+
+COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1"
+FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1"
+FLA_GDN_FIX_BT = os.getenv("FLA_GDN_FIX_BT", "0") == "1"
+
+SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0"))
+
+
+def tensor_cache(
+        fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator that caches the most recent results of a function with tensor inputs.
+
+    This decorator will store the output of the decorated function for the most recent set of input tensors.
+    The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed.
+
+    Args:
+        fn (Callable[..., torch.Tensor]):
+            The function to be decorated. It should take tensor inputs and return tensor outputs.
+
+    Returns:
+        Callable[..., torch.Tensor]:
+            A wrapped version of the input function with single-entry caching.
+    """
+
+    cache_entries: tuple[Optional[tuple], Optional[dict], Any] = []
+    cache_size = 4
+
+    @functools.wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        nonlocal cache_entries, cache_size
+        for i, entry in enumerate(cache_entries):
+            last_args, last_kwargs, last_result = entry
+            if len(args) == len(last_args) and len(kwargs) == len(last_kwargs) \
+                and all(a is b for a, b in zip(args, last_args)) \
+                and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()):
+                cache_entries = cache_entries[:i] + cache_entries[i + 1:] + [
+                    (args, kwargs, last_result)
+                ]
+                return last_result
+
+        result = fn(*args, **kwargs)
+
+        if len(cache_entries) >= cache_size:
+            cache_entries = cache_entries[1:]
+        cache_entries.append((args, kwargs, result))
+        return result
+
+    return wrapper
+
+
+def input_guard(
+        fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
+    """
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        contiguous_args = (i if not isinstance(i, torch.Tensor) else
+                           i.contiguous() for i in args)
+        contiguous_kwargs = {
+            k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+            for k, v in kwargs.items()
+        }
+
+        tensor = None
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                tensor = arg
+                break
+        if tensor is None:
+            for value in kwargs.values():
+                if isinstance(value, torch.Tensor):
+                    tensor = value
+                    break
+
+        if tensor is not None:
+            ctx = torch.cuda.device(tensor.device.index)
+        else:
+            ctx = contextlib.nullcontext()
+
+        with ctx:
+            return fn(*contiguous_args, **contiguous_kwargs)
+
+    return wrapper
+
+
+@functools.cache
+def get_available_device() -> str:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend
+    except BaseException:
+        return 'cpu'
+
+
+@functools.cache
+def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']:
+    device = get_available_device()
+    mapping = {
+        "cuda": "nvidia",
+        "hip": "amd",
+        "xpu": "intel",
+    }
+    # return the mapped value, or the original if not found
+    return mapping.get(device, device)
+
+
+# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
+# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
+# Therefore, we need to check the triton backend to determine the actual GPU vendor.
+device = get_available_device() if get_available_device() != 'hip' else 'cuda'
+device_torch_lib = getattr(torch, device)
+device_platform = _check_platform()
+
+is_amd = (device_platform == 'amd')
+is_intel = (device_platform == 'intel')
+is_nvidia = (device_platform == 'nvidia')
+is_intel_alchemist = (is_intel
+                      and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
+is_nvidia_hopper = (is_nvidia
+                    and ('NVIDIA H' in torch.cuda.get_device_name(0)
+                         or torch.cuda.get_device_capability()[0] >= 9))
+use_cuda_graph = (is_nvidia
+                  and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')
+
+
+def get_all_max_shared_mem():
+    try:
+        return [
+            triton.runtime.driver.active.utils.get_device_properties(i)
+            ['max_shared_mem'] for i in range(device_torch_lib.device_count())
+        ]
+    except BaseException:
+        return [-1]
+
+
+class Backend(Enum):
+    ADA = 101376  # RTX 4090
+    AMPERE = 166912  # A100
+    HOPPER = 232448  # H100
+    DEFAULT = 102400  # Default
+
+    @classmethod
+    def get_shared_memory(cls, arch: str) -> int:
+        try:
+            return cls[arch.upper()].value
+        except KeyError:
+            return cls.DEFAULT.value
+
+
+@functools.cache
+def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
+    try:
+        device_shared_mem_list = get_all_max_shared_mem()
+        max_shared_memory = device_shared_mem_list[tensor_idx]
+        return max_shared_memory >= Backend.get_shared_memory(arch)
+    except Exception:
+        return False
diff --git a/vllm/model_executor/layers/fla/ops/wy_fast.py b/vllm/model_executor/layers/fla/ops/wy_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..70374eb6506425199e44a9ab0d96f20f2eb295e1
--- /dev/null
+++ b/vllm/model_executor/layers/fla/ops/wy_fast.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# SPDX-FileCopyrightText: Songlin Yang, Yu Zhang
+#
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# ruff: noqa: E501
+from typing import Optional
+
+import torch
+
+from vllm.triton_utils import tl, triton
+
+from .index import prepare_chunk_indices
+
+
+@triton.heuristics({'IS_VARLEN': lambda args: args['cu_seqlens'] is not None})
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8] for num_stages in [2, 3, 4]
+    ],
+    key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'IS_VARLEN'],
+)
+@triton.jit(do_not_specialize=['T'])
+def recompute_w_u_fwd_kernel(k, v, beta, w, u, A, g, cu_seqlens, chunk_indices,
+                             T, H: tl.constexpr, Hg: tl.constexpr,
+                             K: tl.constexpr, V: tl.constexpr,
+                             BT: tl.constexpr, BK: tl.constexpr,
+                             BV: tl.constexpr, IS_VARLEN: tl.constexpr):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(
+            tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(
+            tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ),
+                               (i_t * BT, ), (BT, ), (0, ))
+    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T, ), (H, ), (i_t * BT, ),
+                            (BT, ), (0, ))
+    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1),
+                            (i_t * BT, 0), (BT, BT), (1, 0))
+    b_beta = tl.load(p_beta, boundary_check=(0, ))
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_g = tl.exp(tl.load(p_g, boundary_check=(0, )))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1),
+                                (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        p_u = tl.make_block_ptr(u + (bos * H + i_h) * V, (T, V), (H * V, 1),
+                                (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_beta[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, allow_tf32=False)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(k + (bos * Hg + i_h // (H // Hg)) * K, (T, K),
+                                (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK),
+                                (1, 0))
+        p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H * K, 1),
+                                (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
+        b_w = tl.dot(b_A, b_kb)
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: torch.Tensor,
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.LongTensor],
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, v.shape[-1]
+    H = v.shape[-2]
+    BT = A.shape[-1]
+
+    chunk_indices = prepare_chunk_indices(
+        cu_seqlens, BT) if cu_seqlens is not None else None
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    BK = 64
+    BV = 64
+    u = torch.empty_like(v)
+    w = k.new_empty(B, T, H, K)
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        k=k,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        g=g_cumsum,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+    )
+    return w, u
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e5059358c91e3fe0ae3c4baf6e361fa2580f2f48
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..db1b6e98df469480fbbd92f3acc47b2c506b3816
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
index b9dc2d71f6dcf80d741c0035a63361d6442c0d07..1bbb8aa613996df12ab369d7045354dbecc343f6 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -9,16 +9,16 @@
     },
     "2": {
         "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 4
     },
     "4": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
@@ -26,15 +26,15 @@
     "8": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "16": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
@@ -42,7 +42,7 @@
     "24": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
@@ -53,12 +53,12 @@
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 5
     },
     "48": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
@@ -82,10 +82,10 @@
     "128": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "256": {
         "BLOCK_SIZE_M": 16,
@@ -98,8 +98,8 @@
     "512": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
     },
@@ -107,7 +107,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 4
     },
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 4
     },
@@ -123,15 +123,15 @@
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 4
     },
     "3072": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
     },
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8fb4947d62ab2fdd70b2982842c8374e135ff99a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..bdbaf3811c939f43a26d4069ae85c560f027b2c3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6e17bcd214748e9688134a2ec9e18afd2f0da509
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..aa7610cd75e77b43e029c447c409bdd1d1df6c17
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..df920e8b39ba80228438c4e81203fa3c942f5b67
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e8fe8ea67f2464da40a93d167f23b57f9803e3f2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0baf13cb6a5c5c515664278dee242fd19f851e82
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
index 307c9240938c5840b606e953df650976bb4eea44..c7998718dab4c1fcec92155d58b536943ded8d15 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -18,18 +18,18 @@
     "4": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
         "num_stages": 3
     },
     "8": {
         "BLOCK_SIZE_M": 16,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "16": {
         "BLOCK_SIZE_M": 16,
@@ -58,7 +58,7 @@
     "48": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
         "num_stages": 4
@@ -74,73 +74,73 @@
     "96": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 4
     },
     "256": {
-        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 64,
-        "num_warps": 8,
+        "num_warps": 4,
         "num_stages": 4
     },
     "1024": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
         "num_stages": 4
     },
     "1536": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 4
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 32,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 5
+        "num_warps": 4,
+        "num_stages": 3
     },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
+    "2048": {
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 5
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
     }
 }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..b4e736bec9b6596da0702121947463fdaac1d88f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..bb71005a72bc54ebe66b1e43dc1ee622dd4e974f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..ac53df14ce8464340f7e7ebe6d067a8009c35c7c
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f1ed617d6308fbe89377a1a8d85d28f7f5663576
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e72282dc5bcd99e4976ea57f9ca48c697df67f81
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4fc4868eaa85ab14cdea972ea1ff3a7d24da1670
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d70adca05e7799b962a0b098fa72d034a0c83a43
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0f5867fea5f89262b99db682b7516862ccca795f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..d104aa5167b225d38ece8297def713998f8eea4d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000000000000000000000000000000..94408e279b6569b54ddb3ee36c64a425c33c31e1
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..20146f53a6eba8303bcd95e224c59ec1ff227c39
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0140252594f5b50d0c5c4ff94fd0d9eec0d0a52
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f0b45014e863acb318b4959351258cde42dd2d7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d69efe9ed5f96c89f9bc381ddb52eecdec46a23
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..e55df46b402692b35fcc8922d83ef05337dcd2a3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0855a921f3f62fa44102335637f1760e2bed577
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5b6d02123d71092651500d2d0c514a1bcec7a01
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 7b8467a5a0cf0b03c99342b070d7d166d35090b4..c0bfda73eee0d38d3a972933f7f9c94a887df687 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -57,13 +57,14 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
     if not _valid_deep_gemm_shape(M, N, K):
         logger.debug_once(
             "DeepGemm disabled due to unaligned problem size. "
-            "M: %s, N: %s, K: %s. M should >= align size "
-            "and N and K must be multiples of %s."
+            "M: %s, N: %s, K: %s. M should >= %s "
+            "and N and K must be multiples of %s. "
             "This is not an error and we will fall back to triton.",
             M,
             N,
             K,
             align,
+            align,
         )
         return False
     elif N <= 512:
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 437e569d3130d46e842464ebcd4e25168c7024a2..2a3ae478f3eab98c9a3ba55ddea4938d45c454b7 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
+from typing import Callable, Optional, Union
 
 import deep_ep
 import torch
@@ -25,6 +25,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self.num_dispatchers_ = num_dispatchers
         self.dp_size = dp_size
         self.rank_expert_offset = rank_expert_offset
+        self.async_prepare = True
+
         # The dispatch function returns a handle that the combine function
         # requires. We store the handle here so it is available to the
         # combine function.
@@ -47,19 +49,25 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         return torch.int64
 
     def _get_dispatch_config(self) -> Optional[deep_ep.Config]:
-        if self.dp_size not in self.available_rank_configs:
+        if self.num_dispatchers_ not in self.available_rank_configs:
             return None
-        return deep_ep.Buffer.get_dispatch_config(self.dp_size)
+        return deep_ep.Buffer.get_dispatch_config(self.num_dispatchers_)
 
     def _get_combine_config(self) -> Optional[deep_ep.Config]:
-        if self.dp_size not in self.available_rank_configs:
+        if self.num_dispatchers_ not in self.available_rank_configs:
             return None
-        return deep_ep.Buffer.get_combine_config(self.dp_size)
+        return deep_ep.Buffer.get_combine_config(self.num_dispatchers_)
 
-    def _do_dispatch(self, tokens: torch.Tensor,
-                     token_scales: Optional[torch.Tensor],
-                     rank_topk_ids: torch.Tensor,
-                     rank_topk_weights: torch.Tensor, num_experts: int):
+    def _do_dispatch(
+        self,
+        tokens: torch.Tensor,
+        token_scales: Optional[torch.Tensor],
+        rank_topk_ids: torch.Tensor,
+        rank_topk_weights: torch.Tensor,
+        num_experts: int,
+        a1_scale: Optional[torch.Tensor],
+        quant_config: FusedMoEQuantConfig,
+    ) -> Callable:
 
         has_scales = token_scales is not None
 
@@ -93,9 +101,36 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             expert_alignment=1,
             config=self._get_dispatch_config(),
             previous_event=None,
-            async_finish=False,
+            async_finish=self.async_prepare,
             allocate_on_comm_stream=False)
 
+        return lambda: self._receiver(
+            event,
+            has_scales,
+            token_data,
+            expert_topk_ids,
+            num_experts,
+            expert_num_tokens_per_expert_list,
+            expert_topk_weights,
+            a1_scale,
+            quant_config,
+        )
+
+    def _receiver(
+        self,
+        event: deep_ep.EventOverlap,
+        has_scales: bool,
+        token_data: Union[tuple[torch.Tensor, torch.Tensor], torch.Tensor],
+        expert_topk_ids: Optional[torch.Tensor],
+        num_experts: int,
+        expert_num_tokens_per_expert_list: list[int],
+        expert_topk_weights: Optional[torch.Tensor],
+        a1_scale: Optional[torch.Tensor],
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        if self.async_prepare:
+            event.current_stream_wait()
+
         if has_scales:
             expert_x, expert_x_scale = token_data
         else:
@@ -112,6 +147,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # DeepEP's topk_ids output refers to the local experts directly. Offset
         # the topk_ids to move it back to the global experts space so it aligns
         # with existing vLLM interfaces.
+        assert expert_topk_ids is not None
         expert_topk_ids = torch.where(
             expert_topk_ids == -1,
             num_experts - 1 if self.rank_expert_offset == 0 else 0,
@@ -123,10 +159,28 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_tokens_meta = mk.ExpertTokensMetadata.make_from_list(
             expert_num_tokens_per_expert_list, device=expert_x.device)
 
+        # Dispatch and Quant
+        # DeepEP kernels only support dispatching block-quantized
+        # activation scales.
+        # Dispatch in bfloat16 and quantize afterwards
+        if not quant_config.is_block_quantized:
+            # Quantize after dispatch.
+            expert_x_scale = None
+            if expert_x.numel() != 0:
+                expert_x, expert_x_scale = moe_kernel_quantize_input(
+                    expert_x,
+                    a1_scale,
+                    quant_dtype=quant_config.quant_dtype,
+                    per_act_token_quant=False,
+                    block_shape=quant_config.block_shape)
+
         return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
                 expert_topk_weights)
 
-    def prepare(
+    def supports_async(self) -> bool:
+        return True
+
+    def prepare_async(
         self,
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
@@ -137,9 +191,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> Callable:
 
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
@@ -159,37 +211,37 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             )
             if a1q_scale is not None and a1q_scale.numel() == 1:
                 a1q_scale = a1q_scale.view(1, 1)
-            (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
-             expert_topk_weights) = self._do_dispatch(
-                 tokens=a1q,
-                 token_scales=a1q_scale,
-                 rank_topk_ids=topk_ids,
-                 rank_topk_weights=topk_weights,
-                 num_experts=num_experts)
+            a1_post_scale = None
         else:
-            # Dispatch and Quant
-            # DeepEP kernels only support dispatching block-quantized
-            # activation scales.
-            # Dispatch in bfloat16
-            (expert_x, _, expert_tokens_meta, expert_topk_ids,
-             expert_topk_weights) = self._do_dispatch(
-                 tokens=a1,
-                 token_scales=None,
-                 rank_topk_ids=topk_ids,
-                 rank_topk_weights=topk_weights,
-                 num_experts=num_experts)
-            # Quantize after dispatch.
-            expert_x_scale = None
-            if expert_x.numel() != 0:
-                expert_x, expert_x_scale = moe_kernel_quantize_input(
-                    expert_x,
-                    a1_scale,
-                    quant_dtype=quant_config.quant_dtype,
-                    per_act_token_quant=False,
-                    block_shape=quant_config.block_shape)
+            a1q = a1
+            a1q_scale = None
+            a1_post_scale = a1_scale
 
-        return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids,
-                expert_topk_weights)
+        return self._do_dispatch(tokens=a1q,
+                                 token_scales=a1q_scale,
+                                 rank_topk_ids=topk_ids,
+                                 rank_topk_weights=topk_weights,
+                                 num_experts=num_experts,
+                                 a1_scale=a1_post_scale,
+                                 quant_config=quant_config)
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights,
+                                      topk_ids, num_experts, expert_map,
+                                      apply_router_weight_on_input,
+                                      quant_config)
+        return receiver()
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index 93ac11fb4bfbf36636a5066bb50abd0fd88c7d08..1849e49e0ab51d6483ad8a72aeeafd804b257644 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
+from typing import Callable, Optional, Union
 
 import deep_ep
 import torch
@@ -75,7 +75,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self,
         x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
         a1_dtype: torch.dtype,
         quant_dtype: Union[torch.dtype, str, None],
         per_act_token_quant: bool,
@@ -110,7 +109,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         return x, x_scales
 
-    def prepare(
+    def supports_async(self) -> bool:
+        return True
+
+    def prepare_async(
         self,
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
@@ -121,9 +123,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> mk.ReceiverType:
 
         hidden_size = a1.size(1)
         assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
@@ -155,16 +155,48 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                                                 num_experts,
                                                 use_fp8=self.use_fp8_dispatch,
                                                 async_finish=False,
-                                                return_recv_hook=False)
+                                                return_recv_hook=True)
+
+        return lambda: self._receiver(hook, expert_x, expert_num_tokens,
+                                      a1_scale, a1.dtype, quant_config)
+
+    def _receiver(
+        self,
+        hook: Callable,
+        expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        expert_num_tokens: torch.Tensor,
+        a1_scale,
+        a1_dtype,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        hook()
 
         expert_x, expert_x_scale = self._do_quant(
-            expert_x, a1_scale, a2_scale, a1.dtype, quant_config.quant_dtype,
+            expert_x, a1_scale, a1_dtype, quant_config.quant_dtype,
             quant_config.per_act_token_quant, quant_config.block_shape)
 
         expert_tokens_meta = mk.ExpertTokensMetadata(
             expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None)
 
-        return (expert_x, expert_x_scale, expert_tokens_meta, None, None)
+        return expert_x, expert_x_scale, expert_tokens_meta, None, None
+
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights,
+                                      topk_ids, num_experts, expert_map,
+                                      apply_router_weight_on_input,
+                                      quant_config)
+        return receiver()
 
     def finalize(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index 061b02172c4465155902e29a666abe9d3912f5f3..157cb36d4ffd32caa94beeeff42ba2ea693133d9 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -56,9 +56,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         # TODO(bnell): use quant_config + scales instead of ctor args
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> mk.PrepareResultType:
 
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index b46f4be4b912e013c5144ac01adb1c72996b7702..88063668e9188df6c9888a98812af0dad85f8043 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -506,9 +506,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> mk.PrepareResultType:
         assert a1.dim() == 2
         assert topk_ids.dim() == 2
         assert topk_ids.size(0) == a1.size(0)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 28e8fa8962305890148239b2954c7710d1e2a058..a8fd9bc3f550bae2b2464e85d714b3a86a009a43 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -549,7 +549,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
     EM = sorted_token_ids.size(0)
     if A.size(0) < config["BLOCK_SIZE_M"]:
         # optimize for small batch_size.
-        # We assume that top_ids of each token is unique, so
+        # We assume that top_ids of each token is unique,
         # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
         # and we can skip some invalid blocks.
         EM = min(sorted_token_ids.size(0),
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index ec352368af2397b1b01b784a96280ed85ace8a66..09a4d29ef0e8a764143e2b4b7ba81d7cb1a40112 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -6,7 +6,7 @@ import os
 from abc import abstractmethod
 from collections.abc import Iterable
 from enum import Enum
-from typing import Callable, Literal, Optional, overload
+from typing import Callable, Literal, Optional, Union, overload
 
 import torch
 import torch.nn.functional as F
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
-from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx,
+from vllm.utils import (cdiv, direct_register_custom_op, has_deep_ep, has_pplx,
                         round_up)
 
 if current_platform.is_cuda_alike():
@@ -217,6 +217,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
             self.fused_experts = FusedMoEModularKernel(
                 prepare_finalize,
                 experts,
+                layer.shared_experts,
             )
 
     def select_gemm_impl(
@@ -254,7 +255,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         raise NotImplementedError
 
 
@@ -428,7 +429,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
         use_nn_moe: Optional[bool] = False,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
@@ -482,7 +483,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
         use_nn_moe: Optional[bool] = False,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -570,7 +571,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
         use_nn_moe: Optional[bool] = False,
-    ):
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb is not False or expert_load_view is not None or \
                 logical_to_physical_map is not None or \
                 logical_replica_count is not None:
@@ -617,7 +618,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ):
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb is not False or expert_load_view is not None or \
                 logical_to_physical_map is not None or \
                 logical_replica_count is not None:
@@ -657,7 +658,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert not use_grouped_topk
         assert num_expert_group is None
         assert topk_group is None
@@ -733,7 +734,7 @@ def determine_expert_map(
 
     # Create a tensor of size num_experts filled with -1
     expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
-    # Create a expert map for the local experts
+    # Create an expert map for the local experts
     start_idx = ep_rank * base_experts + min(ep_rank, remainder)
     expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
         0, local_num_experts, dtype=torch.int32)
@@ -778,7 +779,7 @@ class FusedMoE(CustomOp):
         intermediate_size: Intermediate size of the experts
         params_dtype: Data type for the parameters.
         reduce_results: Whether to all all_reduce on the output of the layer
-        renomalize: Whether to renormalize the logits in the fused_moe kernel
+        renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.
     """
@@ -809,6 +810,7 @@ class FusedMoE(CustomOp):
         enable_eplb: bool = False,
         num_redundant_experts: int = 0,
         has_bias: bool = False,
+        is_sequence_parallel=False,
     ):
         super().__init__()
         if params_dtype is None:
@@ -820,6 +822,10 @@ class FusedMoE(CustomOp):
         dp_size_ = (dp_size
                     if dp_size is not None else get_dp_group().world_size)
 
+        self.is_sequence_parallel = is_sequence_parallel
+        if self.is_sequence_parallel:
+            self.sp_size = tp_size_
+
         vllm_config = get_current_vllm_config()
         self.moe_parallel_config: FusedMoEParallelConfig = (
             FusedMoEParallelConfig.make(
@@ -829,11 +835,18 @@ class FusedMoE(CustomOp):
 
         self.global_num_experts = num_experts + num_redundant_experts
 
-        # we padding globally so EP buffer allocation works
+        # we are padding globally so EP buffer allocation works
         if quant_config and quant_config.get_name() == "mxfp4":
-            from vllm.model_executor.layers.quantization.mxfp4 import (  # noqa: E501
-                should_use_flashinfer_mxfp4)
-            if current_platform.is_rocm() or should_use_flashinfer_mxfp4():
+            from vllm.model_executor.layers.quantization.mxfp4 import (
+                Mxfp4Backend, get_mxfp4_backend)
+            current_mxfp4_backend = get_mxfp4_backend()
+            if (current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
+                    or current_mxfp4_backend
+                    == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS):
+                hidden_size = round_up(hidden_size, 128)
+            elif (current_platform.is_rocm() or current_mxfp4_backend
+                  == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or
+                  current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
                 hidden_size = round_up(hidden_size, 256)
 
         # For smuggling this layer into the fused moe custom op
@@ -980,6 +993,10 @@ class FusedMoE(CustomOp):
                 dtype=moe.in_dtype,
                 device=torch.cuda.current_device())
 
+    @property
+    def shared_experts(self) -> Optional[torch.nn.Module]:
+        return None
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -1444,6 +1461,7 @@ class FusedMoE(CustomOp):
         return [
             weight.view(self.local_num_experts, -1) for name, weight in weights
             if name not in NON_EXPERT_WEIGHTS
+            and not name.startswith("_shared_experts.")
         ]
 
     def set_eplb_state(
@@ -1626,25 +1644,52 @@ class FusedMoE(CustomOp):
         else:
             return tensor_model_parallel_all_reduce(final_hidden_states)
 
-    def forward(self, hidden_states: torch.Tensor,
-                router_logits: torch.Tensor):
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         og_hidden_states = hidden_states.shape[-1]
         if self.hidden_size != og_hidden_states:
             hidden_states = F.pad(hidden_states,
                                   (0, self.hidden_size - og_hidden_states),
                                   mode='constant',
                                   value=0.0)
-        # TODO: Once the OOM issue for the TPU backend is resolved, we will
-        # switch to using the moe_forward custom op.
-        if current_platform.is_tpu():
-            return self.forward_impl(hidden_states, router_logits)
+
+        if self.shared_experts is None:
+            if current_platform.is_tpu():
+                # TODO: Once the OOM issue for the TPU backend is resolved, we
+                # will switch to using the moe_forward custom op.
+                fused_output = self.forward_impl(hidden_states, router_logits)
+                assert not isinstance(fused_output, tuple)
+            else:
+                fused_output = torch.ops.vllm.moe_forward(
+                    hidden_states, router_logits, self.layer_name)
+            return fused_output[..., :og_hidden_states]
         else:
-            return torch.ops.vllm.moe_forward(
-                hidden_states, router_logits,
-                self.layer_name)[..., :og_hidden_states]
+            if current_platform.is_tpu():
+                # TODO: Once the OOM issue for the TPU backend is resolved, we
+                # will switch to using the moe_forward custom op.
+                shared_output, fused_output = self.forward_impl(
+                    hidden_states, router_logits)
+            else:
+                shared_output, fused_output = torch.ops.vllm.moe_forward_shared(
+                    hidden_states, router_logits, self.layer_name)
+            return (shared_output[..., :og_hidden_states],
+                    fused_output[..., :og_hidden_states])
 
-    def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
-                             full_router_logits: torch.Tensor):
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        return self.forward_native(hidden_states, router_logits)
+
+    def forward_impl_chunked(
+        self,
+        full_hidden_states: torch.Tensor,
+        full_router_logits: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.batched_hidden_states is not None
         assert self.batched_router_logits is not None
         assert self.batched_hidden_states.dtype == full_hidden_states.dtype
@@ -1655,7 +1700,10 @@ class FusedMoE(CustomOp):
         assert (
             self.batched_router_logits.size(-1) == full_router_logits.size(-1))
 
-        full_final_hidden_states = torch.empty_like(full_hidden_states)
+        full_fused_final_hidden_states = torch.empty_like(full_hidden_states)
+        if self.shared_experts is not None:
+            full_shared_final_hidden_states = torch.empty_like(
+                full_hidden_states)
 
         def process_chunk(chunk_start, chunk_end, skip_result_store=False):
             chunk_size = chunk_end - chunk_start
@@ -1696,20 +1744,40 @@ class FusedMoE(CustomOp):
                 logical_replica_count=self.logical_replica_count,
             )
 
+            assert self.shared_experts is None or isinstance(
+                final_hidden_states, tuple)
+
             if not skip_result_store:
-                full_final_hidden_states[chunk_start:chunk_end, :].copy_(
-                    final_hidden_states, non_blocking=True)
+                if self.shared_experts is None:
+                    full_fused_final_hidden_states[
+                        chunk_start:chunk_end, :].copy_(final_hidden_states,
+                                                        non_blocking=True)
+                else:
+                    full_shared_final_hidden_states[
+                        chunk_start:chunk_end, :].copy_(final_hidden_states[0],
+                                                        non_blocking=True)
+                    full_fused_final_hidden_states[
+                        chunk_start:chunk_end, :].copy_(final_hidden_states[1],
+                                                        non_blocking=True)
 
         ctx = get_forward_context()
         # flashinfer_cutlass_kernels can handle: optional DP + TP/EP
-        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
+        max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu
         moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens
+
+        # If the input to the MoE is sequence parallel then divide by sp_size
+        # to find the maximum number of tokens for any individual dispatcher.
+        if self.is_sequence_parallel:
+            max_tokens_across_dispatchers = cdiv(max_tokens_across_dispatchers,
+                                                 self.sp_size)
+
         num_tokens = full_hidden_states.size(0)
         for chunk_idx, chunk_start_ in enumerate(
-                range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank)):
+                range(0, max_tokens_across_dispatchers,
+                      moe_dp_chunk_size_per_rank)):
             chunk_start = chunk_start_
             chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
-                            max_tokens_across_dp)
+                            max_tokens_across_dispatchers)
             # clamp start and end
             chunk_start = min(chunk_start, num_tokens - 1)
             chunk_end = min(chunk_end, num_tokens)
@@ -1719,10 +1787,17 @@ class FusedMoE(CustomOp):
                               chunk_end,
                               skip_result_store=chunk_start_ >= num_tokens)
 
-        return full_final_hidden_states
+        if self.shared_experts is None:
+            return full_fused_final_hidden_states
+        else:
+            return (full_shared_final_hidden_states,
+                    full_fused_final_hidden_states)
 
-    def forward_impl(self, hidden_states: torch.Tensor,
-                     router_logits: torch.Tensor):
+    def forward_impl(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.quant_method is not None
         # Route to the chunked forward path using the FlashInfer Cutlass kernel
         # only when data parallelism (DP) is enabled.
@@ -1738,6 +1813,16 @@ class FusedMoE(CustomOp):
             self.dp_size > 1
             and not self.moe_parallel_config.use_deepep_ht_kernels
             and not self.moe_config.use_flashinfer_cutlass_kernels)
+
+        # If there are shared experts but we are not using a modular kernel, the
+        # shared experts must be called here
+        if (not isinstance(self.quant_method.fused_experts,
+                           FusedMoEModularKernel)
+                and self.shared_experts is not None):
+            shared_output = self.shared_experts(hidden_states)
+        else:
+            shared_output = None
+
         if do_naive_dispatch_combine:
             hidden_states, router_logits = get_ep_group().dispatch(
                 hidden_states, router_logits)
@@ -1767,14 +1852,32 @@ class FusedMoE(CustomOp):
             use_nn_moe=self.use_nn_moe,
         )
 
-        if do_naive_dispatch_combine:
-            final_hidden_states = get_ep_group().combine(final_hidden_states)
-        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
-            # Default set to False. (May have to add shared expert outputs.
-            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
-                final_hidden_states)
+        if shared_output is not None:
+            assert not isinstance(final_hidden_states, tuple)
+            assert self.shared_experts is not None
+            final_hidden_states = (
+                shared_output,
+                final_hidden_states,
+            )
+
+        def reduce_output(states: torch.Tensor,
+                          do_combine: bool = True) -> torch.Tensor:
+            if do_naive_dispatch_combine and do_combine:
+                states = get_ep_group().combine(states)
+
+            if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+                states = self.maybe_all_reduce_tensor_model_parallel(states)
 
-        return final_hidden_states
+            return states
+
+        if self.shared_experts is None:
+            assert not isinstance(final_hidden_states, tuple)
+            return reduce_output(final_hidden_states)
+        else:
+            return (
+                reduce_output(final_hidden_states[0], do_combine=False),
+                reduce_output(final_hidden_states[1]),
+            )
 
     @classmethod
     def make_expert_params_mapping(
@@ -1829,17 +1932,22 @@ class FusedMoE(CustomOp):
         return s
 
 
-def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
-                layer_name: str) -> torch.Tensor:
+def moe_forward(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
-    assert self.quant_method is not None
-
+    assert self.shared_experts is None
     return self.forward_impl(hidden_states, router_logits)
 
 
-def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
-                     layer_name: str) -> torch.Tensor:
+def moe_forward_fake(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
@@ -1852,6 +1960,37 @@ direct_register_custom_op(
     tags=(torch.Tag.needs_fixed_stride_order, ),
 )
 
+
+def moe_forward_shared(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    layer_name: str,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    assert self.shared_experts is not None
+    return self.forward_impl(hidden_states, router_logits)
+
+
+def moe_forward_shared_fake(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    layer_name: str,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    shared_out = torch.empty_like(hidden_states)
+    fused_out = torch.empty_like(hidden_states)
+    return shared_out, fused_out
+
+
+direct_register_custom_op(
+    op_name="moe_forward_shared",
+    op_func=moe_forward_shared,
+    mutates_args=["hidden_states"],
+    fake_impl=moe_forward_shared_fake,
+    dispatch_key=current_platform.dispatch_key,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
+)
+
 # Mark the FusedMoE weight_loader as supporting MoE-specific parameters
 # to avoid expensive runtime reflection in model loading code
 FusedMoE.weight_loader.supports_moe_loading = True  # type: ignore[attr-defined]
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 2ea6383d5ae90ed4111f3bf4362c3e5011bbaaf4..281563c3bfca2541a8688e78b6693d082d48c906 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
 from math import prod
-from typing import Optional, final
+from typing import Callable, Optional, Union, final
 
 import torch
 
@@ -141,6 +141,29 @@ class TopKWeightAndReduce(ABC):
         raise NotImplementedError
 
 
+#
+# PrepareResultType is a tuple of:
+# - quantized + dispatched a.
+# - quantized + dispatched a1_scales.
+# - Optional ExpertTokensMetadata containing gpu/cpu tensors
+#   as big as the number of local experts with the information about the
+#   number of tokens assigned to each local expert.
+# - Optional dispatched expert topk IDs
+# - Optional dispatched expert topk weight
+#
+# See `prepare` method below.
+#
+PrepareResultType = tuple[
+    torch.Tensor,
+    Optional[torch.Tensor],
+    Optional[ExpertTokensMetadata],
+    Optional[torch.Tensor],
+    Optional[torch.Tensor],
+]
+
+ReceiverType = Callable[[], PrepareResultType]
+
+
 # TODO: pass FusedMoEParallelConfig in as ctor parameter?
 class FusedMoEPrepareAndFinalize(ABC):
     """
@@ -160,16 +183,9 @@ class FusedMoEPrepareAndFinalize(ABC):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[
-            torch.Tensor,
-            Optional[torch.Tensor],
-            Optional[ExpertTokensMetadata],
-            Optional[torch.Tensor],
-            Optional[torch.Tensor],
-    ]:
+    ) -> PrepareResultType:
         """
-        Perform any quantization (and/or) dispatching needed
-        for this kernel.
+        Perform any quantization (and/or) dispatching needed for this kernel.
         - a1: The (unquantized) input to the MoE layer.
         - a1_scale: Optional scales for a1
         - a2_scale: Optional scales for the second MoE gemm.  Required to make
@@ -193,6 +209,51 @@ class FusedMoEPrepareAndFinalize(ABC):
         """
         raise NotImplementedError
 
+    def supports_async(self) -> bool:
+        """
+        Indicates whether or not this class implements prepare_async.
+        """
+        return False
+
+    def prepare_async(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> ReceiverType:
+        """
+        Perform any quantization (and/or) dispatching needed for this kernel
+        but do not wait for results from other workers.
+        - a1: The (unquantized) input to the MoE layer.
+        - a1_scale: Optional scales for a1
+        - a2_scale: Optional scales for the second MoE gemm.  Required to make
+          sure the quantization is consistent for both gemms.
+        - topk_ids: The topk ids.
+        - topk_weights: The topk weights.
+        - num_experts: The total number of experts in the global expert space.
+        - expert_map: A tensor mapping expert indices from the global expert
+          space to the local expert space of the expert parallel shard.
+        - apply_router_weight_on_input: When True, apply the weights to the
+          activations, before quantization + dispatching.
+
+        Returns a callback that when invoked waits for results from other
+        workers and has the same return signature as `prepare`, e.g.
+
+        receiver = obj.prepare_async(...)
+        a, a_scales, expert_meta, topk_ids, topk_weights = receiver()
+
+        is equivalent to:
+
+        a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def finalize(
         self,
@@ -241,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC):
     def max_num_tokens_per_rank(self) -> Optional[int]:
         """
         Some PrepareFinalize All2All implementations are batched. Meaning,
-        they can processes only as set of tokens at a time. This
+        they can process only as set of tokens at a time. This
         function returns the batch size i.e the maximum number of tokens
         the implementation can process at a time.
         Return None if there are no such restrictions.
@@ -453,10 +514,12 @@ class FusedMoEModularKernel(torch.nn.Module):
         self,
         prepare_finalize: FusedMoEPrepareAndFinalize,
         fused_experts: FusedMoEPermuteExpertsUnpermute,
+        shared_experts: Optional[torch.nn.Module] = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
         self.fused_experts = fused_experts
+        self.shared_experts = shared_experts
         assert prepare_finalize.activation_format == \
             fused_experts.activation_formats[0], (
                 f"{prepare_finalize.__class__.__name__}."
@@ -692,7 +755,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         a1_scale: Optional[torch.Tensor] = None,
         a2_scale: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         """
         This function computes a Mixture of Experts (MoE) layer using two sets
         of weights, w1 and w2, and top-k gating mechanism.
@@ -736,18 +799,46 @@ class FusedMoEModularKernel(torch.nn.Module):
         if global_num_experts == -1:
             global_num_experts = local_num_experts
 
-        (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
-         _expert_topk_weights) = self.prepare_finalize.prepare(
-             a1,
-             a1_scale,
-             a2_scale,
-             topk_weights,
-             topk_ids,
-             global_num_experts,
-             expert_map,
-             apply_router_weight_on_input,
-             self.fused_experts.quant_config,
-         )
+        shared_output: torch.Tensor
+
+        if (not self.prepare_finalize.supports_async()
+                or self.shared_experts is None):
+
+            # Run shared experts serially with dispatch.
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(a1)
+
+            (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
+             _expert_topk_weights) = self.prepare_finalize.prepare(
+                 a1,
+                 a1_scale,
+                 a2_scale,
+                 topk_weights,
+                 topk_ids,
+                 global_num_experts,
+                 expert_map,
+                 apply_router_weight_on_input,
+                 self.fused_experts.quant_config,
+             )
+        else:
+            # Overlap shared expert compute with all2all dispatch.
+            receiver = self.prepare_finalize.prepare_async(
+                a1,
+                a1_scale,
+                a2_scale,
+                topk_weights,
+                topk_ids,
+                global_num_experts,
+                expert_map,
+                apply_router_weight_on_input,
+                self.fused_experts.quant_config,
+            )
+
+            assert self.shared_experts is not None
+            shared_output = self.shared_experts(a1)
+
+            (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
+             _expert_topk_weights) = receiver()
 
         # Maybe prepare gathered topk_ids and topk_weights from other EP ranks.
         topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids
@@ -795,4 +886,7 @@ class FusedMoEModularKernel(torch.nn.Module):
             self.fused_experts.finalize_weight_and_reduce_impl(),
         )
 
-        return output
+        if self.shared_experts is None:
+            return output
+        else:
+            return shared_output, output
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 582ae3e12c2899d50560ee87ed8e9604e84709b9..23f618b1a5fd76fe401ab31eb3fed6a5b2d86445 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -7,7 +7,7 @@ import torch.nn.functional as F
 
 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
     """
-  Compute the histogram of a int32 tensor. The bin edges are defined by the
+  Compute the histogram of an int32 tensor. The bin edges are defined by the
   min and max values, with step = 1.
   """
     assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 401f37922b7bb5fde4b08fa65e597ebe45e22d40..2ae79e69f55546ff21ca7449de60dc9025827cda 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -84,12 +84,15 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         return self.max_num_tokens
 
     def topk_indices_dtype(self) -> Optional[torch.dtype]:
-        return torch.int32
+        return torch.uint32
 
     def num_dispatchers(self) -> int:
         return self.num_dispatchers_
 
-    def prepare(
+    def supports_async(self) -> bool:
+        return True
+
+    def prepare_async(
         self,
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
@@ -100,9 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> mk.ReceiverType:
         num_tokens = a1.size(0)  # M
         hidden_dim = a1.size(-1)  # K
 
@@ -138,6 +139,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         _validate_scale_shape(a1q, a1q_scale, quant_config.per_act_token_quant,
                               quant_config.block_shape)
 
+        orig_a_scale_block_shape: Optional[int] = None
+
         if a1q_scale is not None:
             scalar_scales = a1q_scale.numel() == 1
 
@@ -205,8 +208,45 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             out_expert_x_scale=expert_x_scale,
             dp_x=a1q,
             dp_x_scale=a1q_scale,
-            indices=topk_ids.view(dtype=torch.uint32),
+            indices=topk_ids,
+            bound_m=bound_m,
+            do_send=True,
+            do_recv=False,
+        )
+
+        return lambda: self._receiver(
+            expert_num_tokens,
+            expert_x,
+            expert_x_scale,
+            a1q,
+            a1q_scale,
+            topk_ids,
+            bound_m,
+            orig_a_scale_block_shape,
+        )
+
+    def _receiver(
+        self,
+        expert_num_tokens: torch.Tensor,
+        expert_x: torch.Tensor,
+        expert_x_scale: Optional[torch.Tensor],
+        a1q: torch.Tensor,
+        a1q_scale: Optional[torch.Tensor],
+        topk_ids: torch.Tensor,
+        bound_m: Optional[torch.Tensor],
+        orig_a_scale_block_shape: Optional[int],
+    ) -> mk.PrepareResultType:
+
+        self.a2a.dispatch(
+            out_expert_num_tokens=expert_num_tokens,
+            out_expert_x=expert_x,
+            out_expert_x_scale=expert_x_scale,
+            dp_x=a1q,
+            dp_x_scale=a1q_scale,
+            indices=topk_ids,
             bound_m=bound_m,
+            do_send=False,
+            do_recv=True,
         )
 
         if expert_x_scale is not None:
@@ -218,6 +258,31 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         return expert_x, expert_x_scale, expert_tokens_meta, None, None
 
+    def prepare(
+        self,
+        a1: torch.Tensor,
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+        quant_config: FusedMoEQuantConfig,
+    ) -> mk.PrepareResultType:
+        receiver = self.prepare_async(
+            a1,
+            a1_scale,
+            a2_scale,
+            topk_weights,
+            topk_ids,
+            num_experts,
+            expert_map,
+            apply_router_weight_on_input,
+            quant_config,
+        )
+        return receiver()
+
     def finalize(
         self,
         output: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 567a0a88fec0aac5666dab6f1474cfbc8655022c..bd9f7d4a06b174c78c5e568d83a9e1ad23ef6310 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -38,9 +38,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor],
-               Optional[torch.Tensor]]:
+    ) -> mk.PrepareResultType:
 
         if apply_router_weight_on_input:
             topk = topk_ids.size(1)
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index f14f13e2ade9d7c4f21f1cd8eeb65552d7334db0..13c3ab4f06dd14df1415d498c4ac8b97981f37fa 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -420,9 +420,8 @@ def shuffle_weights(
 
     Args:
         *tensors: Variable number of torch.Tensor objects.
-        layout: A pair of integers specifying the
-        block sizes used to divide the tensors during shuffling.
-        Default is (16, 16).
+        layout: A pair of integers specifying the block sizes used to divide 
+            the tensors during shuffling. Default is (16, 16).
 
     Returns:
     A Tuple of shuffled tensors.
diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py
index c8b107f13cd0d9b5c69055f293d947ff6d5c6878..8758a570b3c63d9ec572f4397ff737b5a9506351 100644
--- a/vllm/model_executor/layers/fused_moe/routing_simulator.py
+++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py
@@ -10,7 +10,7 @@ like uniform random routing.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 
@@ -50,7 +50,9 @@ class DistributionBasedRouting(RoutingStrategy):
     distributions for testing different routing patterns.
     """
 
-    def __init__(self, distribution: str = "uniform", **distribution_params):
+    def __init__(self,
+                 distribution: str = "uniform",
+                 **distribution_params: Any):
         """
         Initialize distribution-based routing.
 
@@ -244,7 +246,7 @@ class RoutingSimulator:
         cls._routing_strategies[name] = strategy
 
     @classmethod
-    def get_available_strategies(cls):
+    def get_available_strategies(cls) -> list[str]:
         """
         Get list of available routing strategy names.
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index a5fc1db2dc10faad8a0cf488f070af46db575c3a..f875f712ba9c9cd7d58304be2ce80aa8a620595f 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -9,11 +9,11 @@ import torch.nn as nn
 import vllm.envs as envs
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
 
 
 def is_rocm_aiter_rmsnorm_enabled() -> bool:
-    return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER_RMSNORM \
+    return envs.VLLM_ROCM_USE_AITER_RMSNORM \
         and envs.VLLM_ROCM_USE_AITER
 
 
@@ -43,8 +43,22 @@ def fused_add_rms_norm(
     return x, residual
 
 
-def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor,
-                        variance_epsilon: float) -> torch.Tensor:
+def poly_norm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor,
+              variance_epsilon: float) -> torch.Tensor:
+    from vllm import _custom_ops as ops
+    out = torch.empty_like(x)
+    ops.poly_norm(
+        out,
+        x,
+        weight,
+        bias,
+        variance_epsilon,
+    )
+    return out
+
+
+def rocm_aiter_rms_norm_impl(x: torch.Tensor, weight: torch.Tensor,
+                             variance_epsilon: float) -> torch.Tensor:
     import aiter as rocm_aiter
     if x.dim() > 2:
         x_original_shape = x.shape
@@ -55,7 +69,7 @@ def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor,
     return rocm_aiter.rms_norm(x, weight, variance_epsilon)
 
 
-def rocm_aiter_fused_add_rms_norm(
+def rocm_aiter_rmsnorm2d_fwd_with_add_impl(
         x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
         variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]:
 
@@ -74,14 +88,48 @@ def rocm_aiter_fused_add_rms_norm(
     return output, residual_out
 
 
-def dispatch_cuda_rmsnorm_func(add_residual: bool):
-    if add_residual:
-        if is_rocm_aiter_rmsnorm_enabled():
-            return rocm_aiter_fused_add_rms_norm
-        return fused_add_rms_norm
+def rocm_aiter_rms_norm_fake(x: torch.Tensor, weight: torch.Tensor,
+                             variance_epsilon: float) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+def rocm_aiter_rmsnorm2d_fwd_with_add_fake(
+        x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
+        variance_epsilon: float) -> tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty_like(x), torch.empty_like(residual)
 
-    if is_rocm_aiter_rmsnorm_enabled():
-        return rocm_aiter_rms_norm
+
+if current_platform.is_rocm():
+    direct_register_custom_op(
+        op_name="rocm_aiter_rms_norm",
+        op_func=rocm_aiter_rms_norm_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_rms_norm_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+
+    direct_register_custom_op(
+        op_name="rocm_aiter_rmsnorm2d_fwd_with_add",
+        op_func=rocm_aiter_rmsnorm2d_fwd_with_add_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_rmsnorm2d_fwd_with_add_fake,
+        dispatch_key=current_platform.dispatch_key,
+    )
+
+
+def dispatch_rocm_rmsnorm_func(with_fused_add: bool, dtype: torch.dtype):
+    use_aiter = is_rocm_aiter_rmsnorm_enabled() and dtype in [
+        torch.float16, torch.bfloat16
+    ]
+
+    if use_aiter and with_fused_add:
+        return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add
+    if use_aiter:
+        return torch.ops.vllm.rocm_aiter_rms_norm
+
+    # fall back to CUDA implementation
+    if with_fused_add:
+        return fused_add_rms_norm
     return rms_norm
 
 
@@ -114,6 +162,13 @@ class RMSNorm(CustomOp):
             self.weight = torch.ones(hidden_size)
         if self.has_weight:
             self.weight = nn.Parameter(self.weight)
+        weight_dtype = self.weight.data.dtype
+
+        if current_platform.is_rocm():
+            self.rocm_norm_func = dispatch_rocm_rmsnorm_func(
+                with_fused_add=False, dtype=weight_dtype)
+            self.rocm_norm_func_with_add = dispatch_rocm_rmsnorm_func(
+                with_fused_add=True, dtype=weight_dtype)
 
     def forward_native(
         self,
@@ -162,13 +217,27 @@ class RMSNorm(CustomOp):
             return self.forward_native(x, residual)
 
         add_residual = residual is not None
-        norm_func = dispatch_cuda_rmsnorm_func(add_residual)
+        if add_residual:
+            return fused_add_rms_norm(x, residual, self.weight.data,
+                                      self.variance_epsilon)
+        else:
+            return rms_norm(x, self.weight.data, self.variance_epsilon)
+
+    def forward_hip(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        if self.variance_size_override is not None:
+            return self.forward_native(x, residual)
 
+        add_residual = residual is not None
         if add_residual:
-            return norm_func(x, residual, self.weight.data,
-                             self.variance_epsilon)
+            return self.rocm_norm_func_with_add(x, residual, self.weight.data,
+                                                self.variance_epsilon)
         else:
-            return norm_func(x, self.weight.data, self.variance_epsilon)
+            return self.rocm_norm_func(x, self.weight.data,
+                                       self.variance_epsilon)
 
     def forward_xpu(
         self,
@@ -265,3 +334,48 @@ class GemmaRMSNorm(CustomOp):
                 self.forward_static)
             self._is_compiled = True
         return self.forward_native(x, residual)
+
+
+@CustomOp.register("poly_norm")
+class PolyNorm(CustomOp):
+    """Polynomial normalization.
+
+    Computes x -> w_0 * RMSNorm(x^3) + w_1 * RMSNorm(x^2) + w_2 * RMSNorm(x) + b
+    where w_n is the learned weight and b is the bias.
+    Refer to https://arxiv.org/html/2411.03884v1
+    """
+
+    def __init__(
+        self,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
+        self.bias = torch.nn.Parameter(torch.zeros(1))
+        self.variance_epsilon = eps
+
+    def _norm(self, x):
+        return x / torch.sqrt(
+            x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon)
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward().
+
+        Refer to https://github.com/BryceZhuo/PolyCom?tab=readme-ov-file/README.md
+        """
+
+        orig_dtype = x.dtype
+        x_float = x.to(torch.float32)
+        output = (self.weight[0] * self._norm(x_float**3) +
+                  self.weight[1] * self._norm(x_float**2) +
+                  self.weight[2] * self._norm(x_float) + self.bias)
+        return output.to(orig_dtype)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return poly_norm(x, self.weight, self.bias, self.variance_epsilon)
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 19ff63145024fcb653c24d269102c49760a26a0f..fd88eac55cb5121b0401ea95744d8656e4465577 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -9,7 +9,6 @@ import torch
 import torch.nn as nn
 from torch.nn.parameter import Parameter, UninitializedParameter
 
-from vllm import envs
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
@@ -200,26 +199,10 @@ class UnquantizedLinearMethod(LinearMethodBase):
         set_weight_attrs(weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        # special postprocessing for CPU SGL
-        if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL:
-            from vllm.model_executor.layers.utils import check_cpu_sgl_kernel
-            N, K = layer.weight.size()
-            dtype = layer.weight.dtype
-            if check_cpu_sgl_kernel(N, K, dtype):
-                packed_weight = torch.ops._C.convert_weight_packed(
-                    layer.weight)
-                assert packed_weight.size() == layer.weight.size()
-                layer.weight.copy_(packed_weight)
-                if layer.bias is not None:
-                    layer.bias = Parameter(layer.bias.to(torch.float32),
-                                           requires_grad=False)
-                layer.use_cpu_sgl = True
-            else:
-                logger.warning(
-                    "CPU SGL kernels require Intel AMX support,"
-                    " bf16/fp16/int8 weight, IC and OC are divisible by "
-                    "32 and 16.")
-                layer.use_cpu_sgl = False
+        if current_platform.is_cpu():
+            from vllm.model_executor.layers.utils import (
+                dispatch_cpu_unquantized_gemm)
+            dispatch_cpu_unquantized_gemm(layer, remove_weight=True)
 
     def apply(self,
               layer: torch.nn.Module,
@@ -240,6 +223,7 @@ class LinearBase(CustomOp):
         quant_config: Quantization configure.
         prefix: Prefix for parameter names.
         return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: If true, tensor parallelism will be disabled for this layer.
     """
 
     def __init__(
@@ -252,6 +236,7 @@ class LinearBase(CustomOp):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         super().__init__()
 
@@ -271,6 +256,17 @@ class LinearBase(CustomOp):
             self.quant_method = quant_config.get_quant_method(self,
                                                               prefix=prefix)
         self.return_bias = return_bias
+        self.disable_tp = disable_tp
+        self.tp_rank = (get_tensor_model_parallel_rank()
+                        if not disable_tp else 0)
+        self.tp_size = (get_tensor_model_parallel_world_size()
+                        if not disable_tp else 1)
+
+    def update_param_tp_status(self):
+        for param in self.parameters():
+            if isinstance(param, BasevLLMParameter):
+                param.tp_rank = self.tp_rank
+                param.tp_size = self.tp_size
 
 
 @CustomOp.register("replicated_linear")
@@ -287,6 +283,7 @@ class ReplicatedLinear(LinearBase):
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: Take no effect for replicated linear layers.
     """
 
     def __init__(
@@ -300,26 +297,21 @@ class ReplicatedLinear(LinearBase):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
-        # If MergedReplicatedLinear, use output size of each partition.
-        if hasattr(self, "output_sizes"):
-            self.output_partition_sizes = self.output_sizes
-        else:
-            self.output_partition_sizes = [output_size]
-
         super().__init__(input_size,
                          output_size,
                          skip_bias_add,
                          params_dtype,
                          quant_config,
                          prefix=prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)
 
         # All the linear layer supports quant method.
         assert self.quant_method is not None
         self.quant_method.create_weights(self,
-                                         self.input_size,
-                                         self.output_partition_sizes,
+                                         self.input_size, [self.output_size],
                                          self.input_size,
                                          self.output_size,
                                          self.params_dtype,
@@ -375,74 +367,6 @@ class ReplicatedLinear(LinearBase):
         return s
 
 
-class MergedReplicatedLinear(ReplicatedLinear):
-    """Replicated linear layer.
-
-    Args:
-        input_size: input dimension of the linear layer.
-        output_sizes: list of output dimensions of the linear layer.
-        bias: If true, add bias.
-        skip_bias_add: If true, skip adding bias but instead return it.
-        params_dtype: Data type for the parameters.
-        quant_config: Quantization configure.
-        prefix: The name of the layer in the state dict, including all parents
-                        (e.g. model.layers.0.qkv_proj)
-        return_bias: If true, return bias together with outputs in forward pass.
-    """
-
-    def __init__(
-        self,
-        input_size: int,
-        output_sizes: list[int],
-        bias: bool = True,
-        skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-        *,
-        return_bias: bool = True,
-    ):
-        self.output_sizes = output_sizes
-        super().__init__(input_size,
-                         sum(output_sizes),
-                         bias,
-                         skip_bias_add,
-                         params_dtype,
-                         quant_config,
-                         prefix=prefix,
-                         return_bias=return_bias)
-
-    def weight_loader(self,
-                      param: Union[Parameter, BasevLLMParameter],
-                      loaded_weight: torch.Tensor,
-                      loaded_shard_id: Optional[int] = None):
-        assert loaded_shard_id is not None
-        assert loaded_shard_id < len(self.output_sizes)
-
-        if isinstance(param, BlockQuantScaleParameter):
-            from vllm.model_executor.layers.quantization.fp8 import (
-                Fp8LinearMethod, Fp8MoEMethod)
-            assert self.quant_method is not None
-            assert isinstance(self.quant_method,
-                              (Fp8LinearMethod, Fp8MoEMethod))
-            weight_block_size = self.quant_method.quant_config.weight_block_size
-            assert weight_block_size is not None
-            block_n, _ = weight_block_size[0], weight_block_size[1]
-            shard_offset = (
-                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
-                block_n)
-            shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
-                          block_n)
-        elif isinstance(param, PerTensorScaleParameter):
-            shard_offset = loaded_shard_id
-            shard_size = 1
-        else:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id])
-            shard_size = self.output_sizes[loaded_shard_id]
-
-        param.data[shard_offset:shard_offset + shard_size] = loaded_weight
-
-
 @CustomOp.register("column_parallel_linear")
 class ColumnParallelLinear(LinearBase):
     """Linear layer with column parallelism.
@@ -465,7 +389,9 @@ class ColumnParallelLinear(LinearBase):
         output_sizes: list of output sizes packed into one output, like for QKV
                        the list would be size 3.
         prefix: The name of the layer in the state dict, including all parents
-                        (e.g. model.layers.0.qkv_proj) 
+                        (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
     def __init__(
@@ -481,9 +407,13 @@ class ColumnParallelLinear(LinearBase):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         # Divide the weight matrix along the last dimension.
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = (get_tensor_model_parallel_rank()
+                        if not disable_tp else 0)
+        self.tp_size = (get_tensor_model_parallel_world_size()
+                        if not disable_tp else 1)
         self.input_size_per_partition = input_size
         self.output_size_per_partition = divide(output_size, self.tp_size)
         self.output_partition_sizes = [self.output_size_per_partition]
@@ -500,7 +430,8 @@ class ColumnParallelLinear(LinearBase):
                          params_dtype,
                          quant_config,
                          prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)
 
         self.gather_output = gather_output
 
@@ -528,8 +459,7 @@ class ColumnParallelLinear(LinearBase):
             })
         else:
             self.register_parameter("bias", None)
-
-        self.tp_rank = get_tensor_model_parallel_rank()
+        self.update_param_tp_status()
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
 
@@ -571,7 +501,8 @@ class ColumnParallelLinear(LinearBase):
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
-    def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
+    def weight_loader_v2(self, param: BasevLLMParameter,
+                         loaded_weight: torch.Tensor):
         # Special case for loading scales off disk, which often do not
         # have a shape (such as in the case of AutoFP8).
         if len(loaded_weight.shape) == 0:
@@ -587,7 +518,7 @@ class ColumnParallelLinear(LinearBase):
         # Matrix multiply.
         assert self.quant_method is not None
         output_parallel = self.quant_method.apply(self, input_, bias)
-        if self.gather_output:
+        if self.gather_output and self.tp_size > 1:
             # All-gather across the partitions.
             output = tensor_model_parallel_all_gather(output_parallel)
         else:
@@ -601,7 +532,7 @@ class ColumnParallelLinear(LinearBase):
         s = f"in_features={self.input_size}"
         s += f", output_features={self.output_size_per_partition}"
         s += f", bias={self.bias is not None}"
-        s += f", tp_size={get_tensor_model_parallel_world_size()}"
+        s += f", tp_size={self.tp_size}"
         s += f", gather_output={self.gather_output}"
         return s
 
@@ -628,6 +559,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: If true, all weights matrix won't be sharded, this layer
+                    will be treated as a "Replicated" MergedLinear.
     """
 
     def __init__(
@@ -642,10 +575,13 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         self.output_sizes = output_sizes
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = (get_tensor_model_parallel_world_size()
+                        if not disable_tp else 1)
+        self.tp_rank = (get_tensor_model_parallel_rank()
+                        if not disable_tp else 0)
 
         assert all(output_size % self.tp_size == 0
                    for output_size in output_sizes)
@@ -657,7 +593,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                          params_dtype=params_dtype,
                          quant_config=quant_config,
                          prefix=prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)
 
     def weight_loader(self,
                       param: Parameter,
@@ -722,8 +659,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 # If quantized, we need to adjust the offset and size to account
                 # for the packing.
                 if packed_dim == output_dim:
-                    shard_size = shard_size // param.pack_factor
-                    shard_offset = shard_offset // param.pack_factor
+                    shard_size = shard_size // param.packed_factor
+                    shard_offset = shard_offset // param.packed_factor
                     # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
                         param, shard_size, shard_offset)
@@ -756,8 +693,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             # for the packing.
             packed_dim = getattr(param, "packed_dim", None)
             if packed_dim == output_dim:
-                shard_size = shard_size // param.pack_factor
-                shard_offset = shard_offset // param.pack_factor
+                shard_size = shard_size // param.packed_factor
+                shard_offset = shard_offset // param.packed_factor
                 # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
                     param, shard_size, shard_offset)
@@ -849,8 +786,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
         assert loaded_shard_id < len(self.output_sizes)
 
-        tp_size = get_tensor_model_parallel_world_size()
-
         if isinstance(param, BlockQuantScaleParameter):
             from vllm.model_executor.layers.quantization.fp8 import (
                 Fp8LinearMethod, Fp8MoEMethod)
@@ -862,17 +797,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (
                 (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) //
-                block_n) // tp_size
+                block_n) // self.tp_size
             shard_size = ((self.output_sizes[loaded_shard_id] + block_n - 1) //
-                          block_n // tp_size)
+                          block_n // self.tp_size)
         else:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // tp_size
+            shard_offset = sum(
+                self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
 
         param.load_merged_column_weight(loaded_weight=loaded_weight,
                                         shard_id=loaded_shard_id,
                                         shard_offset=shard_offset,
-                                        shard_size=shard_size)
+                                        shard_size=shard_size,
+                                        tp_rank=self.tp_rank)
 
 
 class QKVParallelLinear(ColumnParallelLinear):
@@ -900,6 +837,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
         return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
     def __init__(
@@ -915,6 +853,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         self.hidden_size = hidden_size
         self.head_size = head_size
@@ -923,7 +862,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             total_num_kv_heads = total_num_heads
         self.total_num_kv_heads = total_num_kv_heads
         # Divide the weight matrix along the last dimension.
-        tp_size = get_tensor_model_parallel_world_size()
+        tp_size = (get_tensor_model_parallel_world_size()
+                   if not disable_tp else 1)
         self.num_heads = divide(self.total_num_heads, tp_size)
         if tp_size >= self.total_num_kv_heads:
             self.num_kv_heads = 1
@@ -949,7 +889,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                          params_dtype=params_dtype,
                          quant_config=quant_config,
                          prefix=prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)
 
     def _get_shard_offset_mapping(self, loaded_shard_id: str):
         shard_offset_mapping = {
@@ -1010,10 +951,13 @@ class QKVParallelLinear(ColumnParallelLinear):
                          loaded_shard_id: Optional[str] = None):
         if loaded_shard_id is None:  # special case for certain models
             if isinstance(param, PerTensorScaleParameter):
-                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
+                param.load_qkv_weight(loaded_weight=loaded_weight,
+                                      shard_id=0,
+                                      tp_rank=self.tp_rank)
                 return
             elif type(param) in (RowvLLMParameter, BasevLLMParameter):
-                param.load_qkv_weight(loaded_weight=loaded_weight)
+                param.load_qkv_weight(loaded_weight=loaded_weight,
+                                      tp_rank=self.tp_rank)
                 return
             # TODO: @dsikka - move to parameter.py
             self._load_fused_module_from_checkpoint(param, loaded_weight)
@@ -1037,7 +981,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                               num_heads=self.num_kv_head_replicas,
                               shard_id=loaded_shard_id,
                               shard_offset=shard_offset,
-                              shard_size=shard_size)
+                              shard_size=shard_size,
+                              tp_rank=self.tp_rank)
 
     def weight_loader(self,
                       param: Parameter,
@@ -1107,8 +1052,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                 # If quantized, we need to adjust the offset and size to account
                 # for the packing.
                 if packed_dim == output_dim:
-                    shard_size = shard_size // param.pack_factor
-                    shard_offset = shard_offset // param.pack_factor
+                    shard_size = shard_size // param.packed_factor
+                    shard_offset = shard_offset // param.packed_factor
 
                     # Special case for Marlin.
                     shard_size, shard_offset = adjust_marlin_shard(
@@ -1155,8 +1100,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             # for the packing.
             packed_dim = getattr(param, "packed_dim", None)
             if packed_dim == output_dim:
-                shard_size = shard_size // param.pack_factor
-                shard_offset = shard_offset // param.pack_factor
+                shard_size = shard_size // param.packed_factor
+                shard_offset = shard_offset // param.packed_factor
 
                 # Special case for Marlin.
                 shard_size, shard_offset = adjust_marlin_shard(
@@ -1243,6 +1188,7 @@ class RowParallelLinear(LinearBase):
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.down_proj)
         return_bias: If true, return bias together with outputs in forward pass.
+        disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
     def __init__(
@@ -1258,10 +1204,13 @@ class RowParallelLinear(LinearBase):
         prefix: str = "",
         *,
         return_bias: bool = True,
+        disable_tp: bool = False,
     ):
         # Divide the weight matrix along the first dimension.
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = (get_tensor_model_parallel_rank()
+                        if not disable_tp else 0)
+        self.tp_size = (get_tensor_model_parallel_world_size()
+                        if not disable_tp else 1)
         self.input_size_per_partition = divide(input_size, self.tp_size)
         self.output_size_per_partition = output_size
         self.output_partition_sizes = [output_size]
@@ -1272,7 +1221,8 @@ class RowParallelLinear(LinearBase):
                          params_dtype,
                          quant_config,
                          prefix,
-                         return_bias=return_bias)
+                         return_bias=return_bias,
+                         disable_tp=disable_tp)
 
         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results
@@ -1301,6 +1251,7 @@ class RowParallelLinear(LinearBase):
             })
         else:
             self.register_parameter("bias", None)
+        self.update_param_tp_status()
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         input_dim = getattr(param, "input_dim", None)
@@ -1356,10 +1307,9 @@ class RowParallelLinear(LinearBase):
         if self.input_is_parallel:
             input_parallel = input_
         else:
-            tp_rank = get_tensor_model_parallel_rank()
             splitted_input = split_tensor_along_last_dim(
                 input_, num_partitions=self.tp_size)
-            input_parallel = splitted_input[tp_rank].contiguous()
+            input_parallel = splitted_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
         assert self.quant_method is not None
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index e93be9bfb16579970d84c4f454c805b64b7c12a0..8a4ac214443ebcb980d2b245b06f41c5ce98ca04 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -6,11 +6,11 @@ from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
 import torch
-import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.distributed import (tensor_model_parallel_all_gather,
                               tensor_model_parallel_gather)
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -22,7 +22,8 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None:
         envs.VLLM_LOGITS_PROCESSOR_THREADS)
 
 
-class LogitsProcessor(nn.Module):
+@CustomOp.register("logits_processor")
+class LogitsProcessor(CustomOp):
     """Process logits and apply logits processors from sampling metadata.
 
     This layer does the following:
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index d93cef1a27ad4a1f3265e2a7d7e1f8af9547912a..5fe37a6289e01e1275dd5135b0ba373e5b6936bb 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -83,17 +83,7 @@ class MiniMaxText01RMSNormTP(CustomOp):
             variance = tensor_model_parallel_all_reduce(
                 variance) / self.tp_world
         x = x * torch.rsqrt(variance + self.variance_epsilon)
-
-        weight = self.weight
-        if x.size(-1) != self.weight.size(0):
-            if self.weight.size(0) < x.size(-1):
-                repeat_count = (x.size(-1) + self.weight.size(0)) // x.size(-1)
-                full_weight = self.weight.repeat(repeat_count)
-                weight = full_weight[:x.size(-1)]
-            else:
-                weight = self.weight[:x.size(-1)]
-
-        x = x.to(orig_dtype) * weight
+        x = x.to(orig_dtype) * self.weight
         return x
 
     def forward(
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index bb3fdd38dbef3be9cc578dea93239fabb37a1bbe..04ebdbca85e5d6782e92638904b3c7887fdaa46a 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -291,6 +291,7 @@ class MambaMixer2(MambaBase, CustomOp):
             output_size=self.conv_dim,
             bias=use_conv_bias,
             quant_config=None,
+            prefix=f"{prefix}.conv1d",
         )
         # unsqueeze to fit conv1d weights shape into the linear weights shape.
         # Can't do this in `weight_loader` since it already exists in
@@ -303,6 +304,7 @@ class MambaMixer2(MambaBase, CustomOp):
             output_size=intermediate_size + self.conv_dim + self.num_heads,
             bias=use_bias,
             quant_config=quant_config,
+            prefix=f"{prefix}.in_proj",
         )
 
         # - because in_proj is a concatenation of 3 weights, we
@@ -402,6 +404,7 @@ class MambaMixer2(MambaBase, CustomOp):
             bias=use_bias,
             input_is_parallel=True,
             quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
         )
 
         self.norm = Mixer2RMSNormGated(intermediate_size,
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index 280a9e45e662e26d00fc5fb1f8c6397785b1b7f6..a6c1af91de4213ee0bf8ad36ab0c8ef54d42bc06 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -30,12 +30,8 @@ class MambaStateDtypeCalculator:
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
-        # TODO (tdoublep) requires kernel changes
-        if mamba_cache_dtype == "float32" or mamba_ssm_cache_dtype == "float32":
-            raise ValueError("fp32 state for mamba1 is not yet supported")
-        else:
-            return MambaStateDtypeCalculator.mamba2_state_dtype(
-                model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype)
+        return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype,
+                                      mamba_ssm_cache_dtype)
 
     @classmethod
     def mamba2_state_dtype(
@@ -43,6 +39,16 @@ class MambaStateDtypeCalculator:
         model_dtype: Union[ModelDType, torch.dtype],
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
+    ) -> tuple[torch.dtype, ...]:
+        return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype,
+                                      mamba_ssm_cache_dtype)
+
+    @classmethod
+    def _mamba_state_dtype(
+        cls,
+        model_dtype: Union[ModelDType, torch.dtype],
+        mamba_cache_dtype: MambaDType,
+        mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
         conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype,
                                                     model_dtype)
@@ -64,6 +70,15 @@ class MambaStateDtypeCalculator:
                                                     model_dtype)
         return (conv_state_dtype, )
 
+    @classmethod
+    def gated_delta_net_state_dtype(
+        cls,
+        model_dtype: Union[ModelDType, torch.dtype],
+        mamba_cache_dtype: MambaDType,
+    ) -> tuple[torch.dtype, torch.dtype]:
+        state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
+        return (state_dtype, state_dtype)
+
 
 class MambaStateShapeCalculator:
 
@@ -157,3 +172,31 @@ class MambaStateShapeCalculator:
 
         # for n_groups == 1, this is exactly tp_size - n_groups
         return tp_size - ngroups
+
+    @classmethod
+    def gated_delta_net_state_shape(
+        cls,
+        tp_world_size: int,
+        num_k_heads: int,
+        num_v_heads: int,
+        head_k_dim: int,
+        head_v_dim: int,
+        conv_kernel_size: int,
+        num_spec: int = 0,
+        use_v1: bool = True,
+    ):
+        conv_dim = (head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads)
+        conv_state_shape = (
+            divide(conv_dim, tp_world_size),
+            conv_kernel_size - 1 + num_spec,
+        )
+
+        # In V0, the conv_state shape was swapped during allocation in
+        # MambaCacheManager, but in V1 it needs to be determined here at the
+        # calculation level
+        if use_v1:
+            conv_state_shape = conv_state_shape[1], conv_state_shape[0]
+
+        temporal_state_shape = (divide(num_v_heads,
+                                       tp_world_size), head_k_dim, head_v_dim)
+        return conv_state_shape, temporal_state_shape
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index b8d4bbc37105d78abe48b0faab1e1f15ae35e56c..a0478a359f91b04ac054e7247186d99f39ac6dbc 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -464,7 +464,9 @@ def causal_conv1d_fn(
         # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx]
         # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
         num_cache_lines = conv_states.size(0)
-        assert (num_cache_lines, dim, width - 1) == conv_states.shape
+        assert (num_cache_lines == conv_states.shape[0]
+                and dim == conv_states.shape[1]
+                and width - 1 <= conv_states.shape[2])
         stride_istate_seq = conv_states.stride(0)
         stride_istate_dim = conv_states.stride(1)
         stride_istate_token = conv_states.stride(2)
@@ -623,6 +625,7 @@ def _causal_conv1d_update_kernel(
     conv_state_ptr,
     cache_seqlens_ptr,  # circular buffer
     conv_state_indices_ptr,
+    num_accepted_tokens_ptr,
     o_ptr,  # (batch, dim, seqlen)
     # Matrix dimensions
     batch: int,
@@ -639,6 +642,7 @@ def _causal_conv1d_update_kernel(
     stride_conv_state_seq: tl.constexpr,
     stride_conv_state_dim: tl.constexpr,
     stride_conv_state_tok: tl.constexpr,
+    stride_state_indices: tl.constexpr,
     stride_o_seq: tl.constexpr,
     stride_o_dim: tl.constexpr,
     stride_o_token: tl.constexpr,
@@ -649,6 +653,7 @@ def _causal_conv1d_update_kernel(
     KERNEL_WIDTH: tl.constexpr,
     SILU_ACTIVATION: tl.constexpr,
     IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
     NP2_STATELEN: tl.constexpr,
     USE_PAD_SLOT: tl.constexpr,
     BLOCK_N: tl.constexpr,
@@ -663,8 +668,9 @@ def _causal_conv1d_update_kernel(
 
     if IS_CONTINUOUS_BATCHING:
         # mask = idx_seq < batch
-        conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(
-            tl.int64)
+        conv_state_batch_coord = tl.load(conv_state_indices_ptr +
+                                         idx_seq * stride_state_indices).to(
+                                             tl.int64)
     else:
         conv_state_batch_coord = idx_seq
     if USE_PAD_SLOT:  # noqa
@@ -672,13 +678,32 @@ def _causal_conv1d_update_kernel(
             # not processing as this is not the actual sequence
             return
 
+    if IS_SPEC_DECODING:
+        # The rolling of conv state:
+        #
+        # Before forward, the conv_state is:
+        # [history1, history2, ..., historyM].
+        #
+        # After forward, the conv_state becomes:
+        # [history2, ..., historyM, draft1, draft2, ..., draftN].
+        #
+        # After acceptance, it becomes:
+        #
+        # - accept 1 tokens: [history2, ..., historyM, draft1]
+        # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
+        # - and so on.
+        conv_state_token_offset = (tl.load(num_accepted_tokens_ptr + idx_seq) -
+                                   1)
+    else:
+        conv_state_token_offset = 0
+
     # STEP 1: READ init_state data
     conv_states_base = (conv_state_ptr +
                         (conv_state_batch_coord * stride_conv_state_seq) +
                         (idx_feats * stride_conv_state_dim))
     mask_w = idx_feats < dim
 
-    prior_tokens = conv_states_base
+    prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
     if KERNEL_WIDTH >= 2:
         conv_states_ptrs = prior_tokens  # [BLOCK_N]
         col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
@@ -695,11 +720,15 @@ def _causal_conv1d_update_kernel(
     # STEP 2: assume state_len > seqlen
     idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
 
+    # With speculative decoding, the conv_state updates works in a sliding
+    # window manner, at each forward pass, the tokens are shift by 1, so we
+    # load since idx_tokens + 1.
     conv_state_ptrs_source = (
         conv_state_ptr + (conv_state_batch_coord * stride_conv_state_seq) +
+        conv_state_token_offset * stride_conv_state_tok +
         (idx_feats * stride_conv_state_dim)[None, :] +
-        ((idx_tokens + seqlen) * stride_conv_state_tok)[:, None]
-    )  # [BLOCK_M, BLOCK_N]
+        ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) *
+         stride_conv_state_tok)[:, None])  # [BLOCK_M, BLOCK_N]
     mask = ((conv_state_batch_coord < num_cache_lines)
             & ((idx_tokens + seqlen) < state_len)[:, None]
             & (idx_feats < dim)[None, :])
@@ -820,6 +849,7 @@ def causal_conv1d_update(
     activation: Union[bool, str, None] = None,
     cache_seqlens: Optional[torch.Tensor] = None,
     conv_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
     pad_slot_id: int = PAD_SLOT_ID,
     metadata=None,
     validate_data=False,
@@ -890,10 +920,14 @@ def causal_conv1d_update(
     )  # X (batch, dim, seqlen)
 
     stride_o_seq, stride_o_dim, stride_o_token = out.stride()
-
     stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride(
     )
-    state_len = width - 1
+    stride_state_indices = conv_state_indices.stride(
+        0) if conv_state_indices is not None else 0
+    if num_accepted_tokens is not None:
+        state_len = width - 1 + (seqlen - 1)  # effective state_len needed
+    else:
+        state_len = width - 1
     np2_statelen = triton.next_power_of_2(state_len)
 
     def grid(META):
@@ -910,6 +944,7 @@ def causal_conv1d_update(
         conv_state,
         cache_seqlens,
         conv_state_indices,
+        num_accepted_tokens,
         out,
         # Matrix dimensions
         batch,
@@ -926,6 +961,7 @@ def causal_conv1d_update(
         stride_istate_seq,
         stride_istate_dim,
         stride_istate_token,
+        stride_state_indices,
         stride_o_seq,
         stride_o_dim,
         stride_o_token,
@@ -936,6 +972,7 @@ def causal_conv1d_update(
         KERNEL_WIDTH=width,
         SILU_ACTIVATION=activation in ["silu", "swish"],
         IS_CONTINUOUS_BATCHING=conv_state_indices is not None,
+        IS_SPEC_DECODING=num_accepted_tokens is not None,
         NP2_STATELEN=np2_statelen,
         USE_PAD_SLOT=pad_slot_id is not None,
         BLOCK_N=256,
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
index 365139e237c66c532d8339511e415e0fc6014626..fb8350e191c9485dba4b1d77c24518987701503c 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
@@ -289,6 +289,9 @@ def _chunk_scan_fwd_kernel(
 
             # get the cs at the offset boundary
             # - c_off == 0 is a passthrough
+            # - We need dA_cs at the boundary, defined by c_off - no need
+            #   to increase pointer by pid_m (it is a constant offset,
+            #   i.e. the same for all blocks)
             dA_cs_m_boundary = tl.load(
                 dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize,
                 mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)),
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
index ad58a9918f03c52776cca8bcd08e7fb2872c7d70..a7b3c814859ce18e6027a3b12d3d4973349f39a0 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
@@ -502,7 +502,7 @@ def _chunk_state_varlen_kernel(
         dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
 
     # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
-    # If HAS_INITSTATES==True need to consider two possiblties
+    # If HAS_INITSTATES==True need to consider two possibilities
     # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs
     # - if state_idx >= pid * chunk_size, then we need to insert initstates
     if ((start_idx < pid_c * chunk_size)  # first chunk
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
index d0b3e9e5235bf7a5526dbc90b911f159403448ef..fcc5c905bf77f369ff5d8f24d59f9e673282886f 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py
@@ -106,21 +106,24 @@ def _mamba_chunk_scan_combined_fwd(x,
     # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
     # (middle term of factorization of off-diag blocks; A terms)
     # - for handling chunked prefill, this requires i) initial_states
-    #   ii) seq_idx and iii) is_cont_batched to be all specified.
+    #   ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified.
     # - When a new seq_idx is detected, we will stop passing the prev_state
     #   and switch accordingly to the init_state corresponding to the new seq_idx.
+    # - We will also make sure that the dA_cumsum is taken only from the start of the
+    #   sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries)
     # - this will ensure that states will be updated with the rightmost flushed seq_idx
     #   of the previous chunk. This implies that the first chunk of states is either 0
     #   or equal to init_states of the first example.
     states, final_states = _state_passing_fwd(
         rearrange(states, "... p n -> ... (p n)"),
-        dA_cumsum[:, :, :, -1],
+        dA_cumsum,
         initial_states=rearrange(initial_states, "... p n -> ... (p n)")
         if initial_states is not None else None,
         seq_idx=seq_idx,
         chunk_size=chunk_size,
         out_dtype=state_dtype if state_dtype is not None else C.dtype,
-        is_cont_batched=cu_seqlens is not None)
+        is_cont_batched=cu_seqlens is not None,
+        chunk_offsets=chunk_offsets)
     states, final_states = (rearrange(t, "... (p n) -> ... p n", n=dstate)
                             for t in [states, final_states])
 
diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
index a28fc9ffad71b5a4d42cc7bac70ff1c82860bb03..d61c3a8cdbe9c5d838ed066ff855accec357d009 100644
--- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
+++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
@@ -31,6 +31,8 @@ def _state_passing_fwd_kernel(
     dA_cs_ptr,
     initstates_ptr,
     seq_idx_ptr,
+    chunk_offsets_ptr,
+    chunk_meta_num,
     # Matrix dimensions
     dim,
     nchunks,
@@ -51,6 +53,7 @@ def _state_passing_fwd_kernel(
     stride_dA_cs_batch,
     stride_dA_cs_chunk,
     stride_dA_cs_head,
+    stride_dA_cs_csize,
     stride_initstates_batch,
     stride_initstates_head,
     stride_initstates_dim,
@@ -66,7 +69,8 @@ def _state_passing_fwd_kernel(
     pid_h = tl.program_id(axis=2)
     pid_m = tl.program_id(axis=0)
     states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
-    dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head
+    dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + (
+        chunk_size - 1) * stride_dA_cs_csize
     out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
     final_states_ptr += pid_b * stride_final_states_batch + pid_h * stride_final_states_head
     if HAS_INITSTATES:
@@ -95,35 +99,62 @@ def _state_passing_fwd_kernel(
 
     tl.store(out_ptrs, states, mask=offs_m < dim)
     out_ptrs += stride_out_chunk
-    seq_idx = 0
+    prev_seq_idx_chunk_end = 0
+    logical_chunk_idx = 0
     for c in range(nchunks):
         new_states = tl.load(states_ptrs, mask=offs_m < dim,
                              other=0.0).to(tl.float32)
         dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
-        scale = tl.exp(dA_cs)
+        scale_mask = True
         if HAS_SEQ_IDX:
             # - the seq to pass forward is the one that is flushed to the right
             #   boundary.
-            # - that is given by seq_idx_new below.
-            seq_idx_new = tl.load(seq_idx_ptr +
-                                  (min((c + 1) * chunk_size, seqlen) - 1) *
-                                  stride_seq_idx_seqlen)
+            # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk.
+            seq_idx_chunk_end = tl.load(seq_idx_ptr + (min(
+                (c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen)
             if HAS_INITSTATES:
-                if IS_CONT_BATCHED and seq_idx != seq_idx_new:
+                if IS_CONT_BATCHED and prev_seq_idx_chunk_end != seq_idx_chunk_end:
                     # this means in the current chunk the rightmost flushed seq
                     # has changed.
                     # - so we do not propagate the state from previous chunk
                     # - but rather we load that sequence's init state
-                    initstates_ptrs = initstates_ptr + seq_idx_new * stride_initstates_batch
+                    initstates_ptrs = initstates_ptr + seq_idx_chunk_end * stride_initstates_batch
 
                     # - update state with seq_idx_new's init state
                     states = tl.load(initstates_ptrs,
                                      mask=offs_m < dim,
                                      other=0.0).to(tl.float32)
+
+                    # - we need to consider the cumsum only of the last sequence in the chunk
+                    # - find its starting position (given by c_off of the logical chunk index)
+                    # - and subtract the cumsum just before that position from the total cumsum
+                    # - first, update the logical chunk index (add the number of sequences in the current physical chunk):
+                    # sequence index at the start of the current chunk
+                    seq_idx_chunk_start = tl.load(seq_idx_ptr +
+                                                  min(c * chunk_size, seqlen) *
+                                                  stride_seq_idx_seqlen)
+                    logical_chunk_idx += seq_idx_chunk_end - seq_idx_chunk_start
+                    # - load the chunk offset:
+                    c_off = tl.load(chunk_offsets_ptr + logical_chunk_idx,
+                                    mask=logical_chunk_idx < chunk_meta_num,
+                                    other=0)
+                    # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything
+                    if c_off > 0:
+                        # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset
+                        dA_cs_boundary = tl.load(
+                            dA_cs_ptr - (chunk_size - 1) * stride_dA_cs_csize +
+                            (c_off - 1) * stride_dA_cs_csize,
+                            mask=(c_off - 1) > -1 and c_off < chunk_size,
+                            other=0.0)
+                        dA_cs -= dA_cs_boundary
+
+                # - increment logical chunk index for every physical chunk
+                logical_chunk_idx += 1
             else:
-                scale = tl.where(seq_idx_new == seq_idx, scale, 0.0)
+                scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end
+            prev_seq_idx_chunk_end = seq_idx_chunk_end
 
-            seq_idx = seq_idx_new
+        scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0)
         states = scale * states + new_states
         if c < nchunks - 1:
             tl.store(out_ptrs, states, mask=offs_m < dim)
@@ -136,28 +167,36 @@ def _state_passing_fwd_kernel(
 
 def _state_passing_fwd(
     states,
-    dA_chunk_cumsum,
+    dA_cumsum,
     initial_states=None,
     seq_idx=None,
     chunk_size=None,
     out_dtype=None,
     is_cont_batched=False,
+    chunk_offsets=None,
 ):
     batch, nchunks, nheads, dim = states.shape
-    assert dA_chunk_cumsum.shape == (batch, nheads, nchunks)
+    if chunk_size is None:
+        chunk_size = dA_cumsum.shape[-1]
+    else:
+        assert chunk_size == dA_cumsum.shape[-1]
+    assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
     if initial_states is not None:
         if is_cont_batched:
             # - if cu_seqlens is provided, then the initial states
             #   are used for continuous batching. In which case we
             #   require seq_idx to be provided
-            assert seq_idx is not None, ""
+            assert seq_idx is not None, "seq_idx must be provided for continuous batching"
+            # - we also need chunk_offsets to be provided, to account
+            #   for computation of dA_cumsum from the start of the
+            #   sequence
+            assert chunk_offsets is not None, "chunk_offsets must be provided for continuous batching"
         else:
             # - this is the regular batching case, where initial
             #   states are used are for each example of the batch.
             assert initial_states.shape == (batch, nheads, dim)
 
     if seq_idx is not None:
-        assert chunk_size is not None
         seqlen = seq_idx.shape[-1]
         assert seq_idx.shape == (batch, seqlen)
     out_dtype = states.dtype if out_dtype is None else out_dtype
@@ -173,13 +212,15 @@ def _state_passing_fwd(
             states,
             out,
             final_states,
-            dA_chunk_cumsum,
+            dA_cumsum,
             initial_states,
             seq_idx,
+            chunk_offsets,
+            len(chunk_offsets) if chunk_offsets is not None else 0,
             dim,
             nchunks,
             seqlen if seq_idx is not None else 0,
-            chunk_size if seq_idx is not None else 0,
+            chunk_size,
             states.stride(0),
             states.stride(1),
             states.stride(2),
@@ -191,9 +232,10 @@ def _state_passing_fwd(
             final_states.stride(0),
             final_states.stride(1),
             final_states.stride(2),
-            dA_chunk_cumsum.stride(0),
-            dA_chunk_cumsum.stride(2),
-            dA_chunk_cumsum.stride(1),
+            dA_cumsum.stride(0),
+            dA_cumsum.stride(2),
+            dA_cumsum.stride(1),
+            dA_cumsum.stride(3),
             *((initial_states.stride(0), initial_states.stride(1),
                initial_states.stride(2)) if initial_states is not None else
               (0, 0, 0)),
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
new file mode 100644
index 0000000000000000000000000000000000000000..a05716190365f190037dc566dc661c9615918ee3
--- /dev/null
+++ b/vllm/model_executor/layers/mla.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from vllm.attention import Attention
+from vllm.config import CacheConfig
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.quantization import QuantizationConfig
+
+
+@dataclass
+class MLAModules:
+    """Modules used in MLA.
+    """
+    kv_a_layernorm: torch.nn.Module
+    kv_b_proj: torch.nn.Module
+    rotary_emb: torch.nn.Module
+    o_proj: torch.nn.Module
+    fused_qkv_a_proj: Optional[torch.nn.Module]
+    kv_a_proj_with_mqa: Optional[torch.nn.Module]
+    q_a_layernorm: Optional[torch.nn.Module]
+    q_b_proj: Optional[torch.nn.Module]
+    q_proj: Optional[torch.nn.Module]
+
+
+@CustomOp.register("multi_head_latent_attention")
+class MultiHeadLatentAttention(CustomOp):
+    """MLA layer registered as CustomOp.
+    Note that currently MLA ignores the enable/disable mechanism of CustomOp
+    because there is only one in-tree implementation in forward_native.
+    TODO: implement this with a new PluggableLayer mechanism.
+
+    This class takes positions and hidden_states as input. 
+    The input tensors can either contain prefill tokens or decode tokens.
+    The class does the following:
+
+    1. MLA Preprocess.
+    2. Perform multi-head attention to prefill tokens and
+       multi-query attention to decode tokens separately.
+    3. Return the output tensor.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        scale: float,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: Optional[int],
+        kv_lora_rank: int,
+        mla_modules: MLAModules,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        self.fused_qkv_a_proj = mla_modules.fused_qkv_a_proj
+        self.kv_a_proj_with_mqa = mla_modules.kv_a_proj_with_mqa
+        self.q_a_layernorm = mla_modules.q_a_layernorm
+        self.q_b_proj = mla_modules.q_b_proj
+        self.q_proj = mla_modules.q_proj
+        self.kv_a_layernorm = mla_modules.kv_a_layernorm
+        self.kv_b_proj = mla_modules.kv_b_proj
+        self.rotary_emb = mla_modules.rotary_emb
+        self.o_proj = mla_modules.o_proj
+
+        # In the MLA backend, kv_cache includes both k_c and
+        # pe (i.e. decoupled position embeddings). In particular,
+        # the concat_and_cache_mla op requires
+        #     k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
+        # i.e.
+        #     kv_lora_rank + qk_rope_head_dim == head_size
+        self.mla_attn = Attention(
+            num_heads=self.num_heads,
+            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
+            scale=scale,
+            num_kv_heads=1,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_mla=True,
+            # MLA Args
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            qk_head_dim=self.qk_head_dim,
+            v_head_dim=self.v_head_dim,
+            kv_b_proj=self.kv_b_proj,
+        )
+
+        self.prefix = prefix
+        self.debug_layer_idx = int(self.prefix.split(".")[-2])
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        q_c = None
+        kv_lora = None
+
+        if self.q_lora_rank is not None:
+            assert self.fused_qkv_a_proj is not None, \
+                "fused_qkv_a_proj is required when q_lora_rank is not None"
+            assert self.q_a_layernorm is not None, \
+                "q_a_layernorm is required when q_lora_rank is not None"
+            assert self.q_b_proj is not None, \
+                "q_b_proj is required when q_lora_rank is not None"
+            qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
+            q_c, kv_lora = qkv_lora.split(
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
+                dim=-1,
+            )
+            q_c = self.q_a_layernorm(q_c)
+            q = self.q_b_proj(q_c)[0]
+        else:
+            assert self.kv_a_proj_with_mqa is not None, \
+                "kv_a_proj_with_mqa is required when q_lora_rank is None"
+            assert self.q_proj is not None, \
+                "q_proj is required when q_lora_rank is None"
+            kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0]
+            q = self.q_proj(hidden_states)[0]
+
+        kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim],
+                                   dim=-1)
+        kv_c_normed = self.kv_a_layernorm(kv_c)
+
+        q = q.view(-1, self.num_heads, self.qk_head_dim)
+        # Add head dim of 1 to k_pe
+        k_pe = k_pe.unsqueeze(1)
+
+        q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb(
+            positions, q[..., self.qk_nope_head_dim:], k_pe)
+
+        attn_out = self.mla_attn(
+            q,
+            kv_c_normed,
+            k_pe,
+            output_shape=(hidden_states.shape[0],
+                          self.num_heads * self.v_head_dim))
+        return self.o_proj(attn_out)[0]
+
+    def forward_cuda(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 62b3ee1abaca841b8814909d7322cc26141c7be7..b571a8f866990577e0ad68bf06e30135a72d4d48 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -5,7 +5,7 @@ from collections.abc import Mapping, Set
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import groupby
-from typing import Callable, Optional, TypeVar, Union, cast
+from typing import Callable, Optional, TypeVar, Union
 
 import torch
 import torch.nn as nn
@@ -362,14 +362,13 @@ class PoolerIdentity(PoolerActivation):
 class PoolerNormalize(PoolerActivation):
 
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        x = F.normalize(pooled_data.float(), p=2, dim=-1)
-        return x.to(pooled_data.dtype)
+        return F.normalize(pooled_data, p=2, dim=-1)
 
 
 class PoolerMultiLabelClassify(PoolerActivation):
 
     def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
-        return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
+        return F.sigmoid(pooled_data)
 
 
 class PoolerClassify(PoolerActivation):
@@ -394,9 +393,9 @@ class PoolerClassify(PoolerActivation):
                       pooled_data.shape[-1])
 
         if num_labels < 2:
-            return F.sigmoid(pooled_data.float()).to(pooled_data.dtype)
+            return F.sigmoid(pooled_data)
 
-        return F.softmax(pooled_data.float(), dim=-1).to(pooled_data.dtype)
+        return F.softmax(pooled_data, dim=-1)
 
 
 class LambdaPoolerActivation(PoolerActivation):
@@ -432,8 +431,9 @@ class EmbeddingPoolerHead(PoolerHead):
         from vllm.model_executor.models.adapters import _load_st_projector
 
         vllm_config = get_current_vllm_config()
-        self.projector = _load_st_projector(
+        self.projector: Optional[nn.Module] = _load_st_projector(
             vllm_config.model_config) if vllm_config else None
+        self.head_dtype = vllm_config.model_config.head_dtype
 
     def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
                 pooling_metadata: PoolingMetadata):
@@ -442,16 +442,11 @@ class EmbeddingPoolerHead(PoolerHead):
             pooled_data = torch.stack(pooled_data)
         # pooled_data shape: [batchsize, hidden_dimension]
 
+        pooled_data = pooled_data.to(self.head_dtype)
+
         # Apply ST projector
         if self.projector is not None:
-            projector = cast(nn.Module, self.projector)
-
-            def _proj(x: torch.Tensor) -> torch.Tensor:
-                orig_dtype = x.dtype
-                y = projector(x.to(torch.float32))
-                return y.to(orig_dtype)
-
-            pooled_data = _proj(pooled_data)
+            pooled_data = self.projector(pooled_data)
         # pooled_data shape: [batchsize, embedding_dimension]
 
         pooling_params = get_pooling_params(pooling_metadata)
@@ -494,8 +489,18 @@ class RewardPoolerHead(PoolerHead):
     def __init__(self) -> None:
         super().__init__(activation=PoolerClassify(static_num_labels=False))
 
+        from vllm.config import get_current_vllm_config
+        vllm_config = get_current_vllm_config()
+        self.head_dtype = vllm_config.model_config.head_dtype
+
     def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor],
                 pooling_metadata: PoolingMetadata):
+
+        if isinstance(pooled_data, list):
+            pooled_data = [p.to(self.head_dtype) for p in pooled_data]
+        else:
+            pooled_data = pooled_data.to(self.head_dtype)
+
         pooling_params = get_pooling_params(pooling_metadata)
 
         # for softmax
@@ -633,9 +638,15 @@ class ClassifierPooler(Pooler):
     ) -> None:
         super().__init__()
 
+        from vllm.config import get_current_vllm_config
+        vllm_config = get_current_vllm_config()
+
         self.pooling = pooling
         self.classifier = classifier
         self.act_fn = act_fn or PoolerClassify()
+        self.logit_bias: Optional[
+            float] = vllm_config.model_config.pooler_config.logit_bias
+        self.head_dtype = vllm_config.model_config.head_dtype
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
         return {"classify", "score"}
@@ -650,10 +661,15 @@ class ClassifierPooler(Pooler):
             pooled_data = torch.stack(pooled_data)
         # pooled_data shape: [batchsize, hidden_size]
 
+        pooled_data = pooled_data.to(self.head_dtype)
+
         if self.classifier is not None:
             pooled_data = self.classifier(pooled_data)
         # pooled_data shape: [batchsize, num_labels]
 
+        if self.logit_bias is not None:
+            pooled_data -= self.logit_bias
+
         pooling_params = get_pooling_params(pooling_metadata)
         flags = [p.activation for p in pooling_params]
 
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index d73fcf368f261879036b13bd586c405a724833b8..8cac47b5a39a347d0a9c8c4cd415f6044484acf6 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,7 +26,6 @@ QuantizationMethods = Literal[
     "bitsandbytes",
     "hqq",
     "experts_int8",
-    "neuron_quant",
     "ipex",
     "quark",
     "moe_wna16",
@@ -108,7 +107,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
     from .moe_wna16 import MoeWNA16Config
     from .mxfp4 import Mxfp4Config
-    from .neuron_quant import NeuronQuantConfig
     from .petit import PetitNvFp4Config
     from .ptpc_fp8 import PTPCFp8Config
     from .rtn import RTNConfig
@@ -135,7 +133,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "ptpc_fp8": PTPCFp8Config,
         "hqq": HQQMarlinConfig,
         "experts_int8": ExpertsInt8Config,
-        "neuron_quant": NeuronQuantConfig,
         "ipex": IPEXConfig,
         "quark": QuarkConfig,
         "moe_wna16": MoeWNA16Config,
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index fb285413ba9efd71301d3c05bd4105789f9aba66..1ca92273430dde72afacefdcda44aaaaae0b212a 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig):
 
         if isinstance(layer, FusedMoE):
             if use_marlin:
+                return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
+            else:
                 from vllm.model_executor.layers.quantization.moe_wna16 import (
                     MoeWNA16Config)
 
@@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig):
                 }
                 return MoeWNA16Config.from_config(config).get_quant_method(
                     layer, prefix)
-            return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
 
         if isinstance(layer, (LinearBase, ParallelLMHead)):
             if use_marlin:
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 8293d42ef4556a39cfff1dc42286554330e05fbb..bf99f0823b74556a467bd70688949aca4c23a29e 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 from torch.nn import Parameter
@@ -505,7 +505,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index ebc526d6db2f9349938570ed91d6f874d07730dd..2e8894436a98522c396a1698c037c6a95fbcb1ac 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -19,7 +19,7 @@ def awq_dequantize_kernel(
         num_rows,  # input num rows in qweight
         BLOCK_SIZE_X: tl.constexpr,
         BLOCK_SIZE_Y: tl.constexpr):
-    # Setup the pids.
+    # Set up the pids.
     pid_x = tl.program_id(axis=0)
     pid_y = tl.program_id(axis=1)
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 4a43351260e9f5d717798ca642935a07b50d8c39..6fd94afbe55662dead857aa8d312fb90752438cf 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -128,7 +128,7 @@ class QuantizationConfig(ABC):
     @staticmethod
     def get_from_keys_or(config: dict[str, Any], keys: list[str],
                          default: Any) -> Any:
-        """Get a optional value from the model's quantization config."""
+        """Get an optional value from the model's quantization config."""
         try:
             return QuantizationConfig.get_from_keys(config, keys)
         except ValueError:
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index 39bd34d351f61c962726e90fa1ff1007df878ea5..d05c0c0d5473cb2d92724ca8e48144e48848a79a 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -202,7 +202,7 @@ class BitBLASLinearMethod(LinearMethodBase):
         output_size: int,
         params_dtype: torch.dtype,
         **extra_weight_attrs,
-    ):
+    ) -> None:
         """Creates quantized weights for use in linear operations.
 
         The function initializes and returns a dictionary containing quantized 
@@ -211,7 +211,7 @@ class BitBLASLinearMethod(LinearMethodBase):
 
         Args:
             input_size_per_partition: The size of the input partition.
-            output_size_per_partition: The size of the output partition.
+            output_partition_sizes: List of output partition sizes.
             input_size: The total size of the input (unused).
             output_size: The total size of the output (unused).
             params_dtype: 
@@ -222,9 +222,9 @@ class BitBLASLinearMethod(LinearMethodBase):
             scales ('scales'), and zeros ('zeros').
 
         Raises:
-            ValueError: If `params_dtype` is not `torch.float16` or if the 
-            input size per partition is not divisible by the group size in 
-            `quant_config`.
+            ValueError: If `params_dtype` is not `torch.float16` or if the input
+                size per partition is not divisible by the group size
+                in `quant_config`.
         """
         del input_size, output_size  # Unused arguments.
         weight_loader = extra_weight_attrs["weight_loader"]
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 9713757df9b077a9db3c776d13aa3ba7cfaaded5..2245c59af6feafd4784c12506a048955710e7ffd 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -474,7 +474,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         from vllm.model_executor.layers.fused_moe import fused_experts
         assert self.fused_experts is None
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b07bf675ca47d18bf50a6384768e42fb6ce8ec46..97041a5a050f158fbc71260b0d974216ce710f4a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -63,7 +63,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         sparsity_ignore_list: list[str],
         kv_cache_scheme: Optional[dict[str, Any]] = None,
         config: Optional[dict[str, Any]] = None,
-        transform_config: Optional[TransformConfig] = None,
+        transform_config: Optional[dict[str, Any]] = None,
     ):
         super().__init__()
         self.ignore = ignore
@@ -75,7 +75,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         self.sparsity_ignore_list = sparsity_ignore_list
         self.config = config
 
-        if transform_config is not None:
+        if transform_config:
             self.transform_config = TransformConfig.model_validate(
                 transform_config)
         else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index e4585419226cd577263fcdf23e1088e284935eaa..c2b884c058d3af618f6861e1fbc95b2c53ba82ce 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -3,7 +3,7 @@
 
 import enum
 from enum import Enum
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import torch
 from compressed_tensors import CompressionFormat
@@ -358,7 +358,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -819,7 +819,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for "
@@ -1069,7 +1069,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -1375,7 +1375,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -1608,7 +1608,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
index b3be25471773418bb1c643c1ac9f45acceefd152..48ab2582a3b26766147f805b2ef57d453bb446b7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
@@ -83,7 +83,7 @@ class HadamardTransform(torch.nn.Module):
             # do not fold into weight in order to utilize FWHT
             self.scales[part_id] = 1 / math.sqrt(data.size(0))
 
-            # FUTURE: avoid runtime tranpose by processing weights
+            # FUTURE: avoid runtime transpose by processing weights
             # prior to apply
 
     def forward(self, value: Tensor, part_id: int = 0) -> Tensor:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 099d8613fc1a73db206d3e8fc7056ac2d5d1bf1f..b2dd2501095f8031621070497b72a3ba681e2ec7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -94,7 +94,7 @@ def find_matched_target(
     config that a layer corresponds to.
 
     Recall that a compressed-tensors configs has a concept of
-    config_groups, where each layer can be quantized with with a different
+    config_groups, where each layer can be quantized with a different
     scheme.
 
     targets in each config_group will be a list of either layer names
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 2d8a684bc7d9006eb1fbda374960ceba306784e4..b361fe9bea0880f3755143073ee8f9d77be1886d 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -128,7 +128,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 48bac8697e4662249421831d4c78420bb478f409..3d94626e5d8c617596393033079a3fc886a226e6 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -30,7 +30,8 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
     select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace)
+    get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace,
+    should_use_deepgemm_for_fp8_linear)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin,
     prepare_moe_fp8_layer_for_marlin)
@@ -137,10 +138,35 @@ class Fp8Config(QuantizationConfig):
                    ignored_layers=ignored_layers,
                    weight_block_size=weight_block_size)
 
+    def get_xpu_quant_method(self, layer: torch.nn.Module,
+                             prefix: str) -> Optional["QuantizeMethodBase"]:
+        from vllm.attention.layer import Attention
+        from vllm.model_executor.layers.quantization.ipex_quant import (
+            XPUFp8LinearMethod, XPUFp8MoEMethod)
+        fp8_config = Fp8Config(
+            is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized,
+            activation_scheme=self.activation_scheme,
+            ignored_layers=self.ignored_layers,
+            weight_block_size=self.weight_block_size)
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
+                return UnquantizedLinearMethod()
+            return XPUFp8LinearMethod(fp8_config)
+        elif isinstance(layer, FusedMoE):
+            return XPUFp8MoEMethod(fp8_config, layer)
+        elif isinstance(layer, Attention):
+            return Fp8KVCacheMethod(self)
+        return None
+
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
 
+        if current_platform.is_xpu():
+            return self.get_xpu_quant_method(layer, prefix)
         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix=prefix,
                                 ignored_layers=self.ignored_layers,
@@ -245,7 +271,8 @@ class Fp8LinearMethod(LinearMethodBase):
         layer.weight_block_size = None
 
         if self.block_quant:
-            tp_size = get_tensor_model_parallel_world_size()
+            tp_size = getattr(layer, "tp_size",
+                              get_tensor_model_parallel_world_size())
             assert self.quant_config.weight_block_size is not None
             layer.weight_block_size = self.quant_config.weight_block_size
             block_n, block_k = (
@@ -423,10 +450,10 @@ class Fp8LinearMethod(LinearMethodBase):
             # Activations not quantized for marlin.
             del layer.input_scale
 
-        # On B200, if E8M0 for DeepGemm is used, we need to
+        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
         # requantize the weight and input to the specific scale
         # at the same time.
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             block_sz = tuple(layer.weight_block_size)
             requant_weight_ue8m0_inplace(
@@ -436,6 +463,15 @@ class Fp8LinearMethod(LinearMethodBase):
                 block_sz,
             )
 
+        # SM90 Block FP8 CUTLASS requires row-major weight scales
+        if (self.block_quant and current_platform.is_device_capability(90)
+                and self.cutlass_block_fp8_supported
+                and not should_use_deepgemm_for_fp8_linear(
+                    torch.bfloat16, layer.weight)):
+            layer.weight_scale_inv = Parameter(
+                layer.weight_scale_inv.data.T.contiguous(),
+                requires_grad=False)
+
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
@@ -731,10 +767,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 layer.w2_weight = torch.nn.Parameter(shuffled_w2,
                                                      requires_grad=False)
 
-            # DeepGemm scales need to be transposed and aligned.  We try to do
+            # DeepGemm scales need to be transposed and aligned. We try to do
             # it ahead of time for performance reasons.
             if self.allow_deep_gemm and not is_deep_gemm_e8m0_used():
-                # Lazy import to avoid CUDA initialization problems.
                 if _is_col_major(layer.w13_weight_scale_inv):
                     layer.w13_weight_scale_inv = \
                         get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous()
@@ -870,7 +905,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w13_input_scale
             del layer.w2_input_scale
 
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             # Re-quantise the expert weights so their scales are UE8M0.
             block_sz = tuple(layer.weight_block_size)
@@ -963,7 +998,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index ad648df23819444537a7ca4f224798951b4d83bf..01af1ccd9ae06c2f859b12d1ee23718a21f52acc 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import gguf
 import torch
@@ -540,7 +540,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ):
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index d03074f861848107d7f65c7cc43624a6e36e2dbf..64622925864820bbd8c0d3313a74d19437994f1a 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -265,9 +265,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
             scales ('scales'), and zeros ('zeros').
 
         Raises:
-            ValueError: If `params_dtype` is not `torch.float16` or 
-            if the input size per partition is not divisible by the 
-            group size in `quant_config`.
+            ValueError: If `params_dtype` is not `torch.float16` or if the input
+                size per partition is not divisible by the group size
+                in `quant_config`.
         """
         if params_dtype != torch.float16:
             raise ValueError("Parameter data type must be torch.float16, "
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 3644d91f64e3c26ba9221da81387480901bcc87f..76de3a59c8ca157b87fc86da50fe38d96d979626 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         )
         layer.register_parameter("w2_scales", w2_scales)
         set_weight_attrs(w2_scales, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_scales,
                          {"load_full_w2": self.quant_config.desc_act})
         # up_proj scales
@@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         )
         layer.register_parameter("w2_qzeros", w2_qzeros)
         set_weight_attrs(w2_qzeros, extra_weight_attrs)
-        # dont shard the w2 scales when running act order
+        # don't shard the w2 scales when running act order
         set_weight_attrs(w2_qzeros,
                          {"load_full_w2": self.quant_config.desc_act})
         w13_g_idx = torch.nn.Parameter(
@@ -654,7 +654,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 9c458954f960f9a3ec5457c896489c7339db6667..5f9d4814274c81d72119a9591043db9c76b65f9e 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,11 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 
 import torch
 from packaging import version
+from torch.nn import Module
+from torch.nn.parameter import Parameter
 
+from vllm._ipex_ops import ipex_ops as ops
+from vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase,
+                                                  FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -13,7 +18,10 @@ from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod,
                                                          is_layer_skipped_awq)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
+                                                         Fp8LinearMethod)
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
 MIN_IPEX_VERSION = "2.6.0"
@@ -251,3 +259,152 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
         return out.reshape(x.shape[:-1] + (layer.ipex_output_size, ))
+
+
+class XPUFp8LinearMethod(Fp8LinearMethod):
+
+    def __init__(self, quant_config: Fp8Config):
+        super().__init__(quant_config)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # If checkpoint not serialized fp8, quantize the weights.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
+                                                         scale=None)
+            # Update the layer with the new values.
+            layer.weight = Parameter(qweight, requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.input_scale = None
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        weight = layer.weight.data
+        weight_scale = layer.weight_scale.data
+        output = torch.ops.torch_ipex.fp8_gemm_w8a16(x, weight, True,
+                                                     weight_scale, bias)
+        return output
+
+
+class XPUFp8MoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
+        super().__init__(layer.moe_config)
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
+                       intermediate_size_per_partition: int,
+                       params_dtype: torch.dtype, **extra_weight_attrs):
+
+        layer.intermediate_size_per_partition = intermediate_size_per_partition
+        layer.hidden_size = hidden_size
+        layer.num_experts = num_experts
+        layer.orig_dtype = params_dtype
+        layer.weight_block_size = None
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            2 * intermediate_size_per_partition,
+            hidden_size,
+            dtype=params_dtype),
+                                        requires_grad=False)
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(torch.empty(
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            dtype=params_dtype),
+                                       requires_grad=False)
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                         2,
+                                                         dtype=torch.float32),
+                                              requires_grad=False)
+        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
+                                                        dtype=torch.float32),
+                                             requires_grad=False)
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value})
+        # INPUT_SCALES
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            fp8_dtype = current_platform.fp8_dtype()
+            w13_weight = torch.empty_like(layer.w13_weight.data,
+                                          dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
+
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
+                layer.local_num_experts,
+                dtype=torch.float32,
+                device=w13_weight.device),
+                                                        requires_grad=False)
+            for expert in range(layer.local_num_experts):
+                w13_weight[expert, :, :], layer.w13_weight_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        layer.w13_weight.data[expert, :, :])
+                w2_weight[expert, :, :], layer.w2_weight_scale[
+                    expert] = ops.scaled_fp8_quant(
+                        layer.w2_weight.data[expert, :, :])
+            layer.w13_weight = torch.nn.Parameter(w13_weight,
+                                                  requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight,
+                                                 requires_grad=False)
+        import intel_extension_for_pytorch as ipex
+        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+            layer.w13_weight,
+            layer.w2_weight,
+            w1_scale_inv=layer.w13_weight_scale,
+            w2_scale_inv=layer.w2_weight_scale,
+            a1_scale_inv=layer.w13_input_scale,
+            a2_scale_inv=layer.w2_input_scale,
+            use_prepack=True,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor: float = 1.0,
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+            custom_routing_function=custom_routing_function,
+        )
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index 4bcfcd04b3d8b0021c85579bbbfa1765fb9f6ed8..f10d20999bee3571c3404d7fcbc51b35ecaeb7bb 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -46,11 +46,11 @@ def choose_mp_linear_kernel(
      performance.
 
     Args:
-        config (MPLinearLayerConfig): Description of the linear layer to be 
-          implemented.
+        config (MPLinearLayerConfig): Description of the linear layer to be
+            implemented.
         compute_capability (Optional[int], optional): The compute capability of
-          the target device, if None uses `current_platform` to get the compute 
-          capability. Defaults to None.
+            the target device, if None uses `current_platform` to get
+            the compute capability. Defaults to None.
 
     Raises:
         ValueError: If no kernel can implement the given config.
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 4bb8438d90844fe56db1b7f46077e7440498934e..9b99931e7b43fcb28c08b933648971a2b6fd25d1 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch
 from torch.nn import Module
@@ -45,6 +45,9 @@ from vllm.utils import next_power_of_2
 from vllm.utils.flashinfer import (flashinfer_scaled_fp4_mm, has_flashinfer,
                                    has_flashinfer_moe)
 
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
 logger = init_logger(__name__)
 
 QUANT_ALGOS = ["FP8", "NVFP4"]
@@ -63,7 +66,7 @@ class ModelOptFp8Config(QuantizationConfig):
         super().__init__()
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         self.kv_cache_quant_method = kv_cache_quant_method
-        self.exclude_modules = exclude_modules
+        self.exclude_modules = exclude_modules or []
         if is_checkpoint_fp8_serialized:
             logger.warning("Detected ModelOpt fp8 checkpoint. Please note that"
                            " the format is experimental and could change.")
@@ -84,6 +87,11 @@ class ModelOptFp8Config(QuantizationConfig):
     def get_config_filenames(cls) -> list[str]:
         return ["hf_quant_config.json"]
 
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        if self.exclude_modules is not None:
+            self.exclude_modules = hf_to_vllm_mapper.apply_list(
+                self.exclude_modules)
+
     @classmethod
     def override_quantization_method(
             cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
@@ -170,7 +178,9 @@ class ModelOptFp8Config(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
         if isinstance(layer, LinearBase):
-            if self.is_layer_excluded(prefix):
+            if (is_layer_skipped(prefix, self.exclude_modules,
+                                 self.packed_modules_mapping)
+                    or self.is_layer_excluded(prefix)):
                 return UnquantizedLinearMethod()
             return ModelOptFp8LinearMethod(self)
         elif isinstance(layer, Attention):
@@ -491,7 +501,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptFp8MoEMethod` yet.")
@@ -615,6 +625,11 @@ class ModelOptNvFp4Config(QuantizationConfig):
     def get_config_filenames(cls) -> list[str]:
         return ["hf_quant_config.json"]
 
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        if self.exclude_modules is not None:
+            self.exclude_modules = hf_to_vllm_mapper.apply_list(
+                self.exclude_modules)
+
     @classmethod
     def override_quantization_method(
             cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:
@@ -763,7 +778,8 @@ class ModelOptNvFp4Config(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention  # Avoid circular import
         if isinstance(layer, LinearBase):
-            if (is_layer_skipped(prefix, self.exclude_modules)
+            if (is_layer_skipped(prefix, self.exclude_modules,
+                                 self.packed_modules_mapping)
                     or self.is_layer_excluded(prefix, self.exclude_modules)):
                 return UnquantizedLinearMethod()
             return ModelOptNvFp4LinearMethod(self)
@@ -1366,7 +1382,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ):
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.")
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index fb3e4b518bf6cf5a3b8c0cb9699cc63278d2a3c2..d6d7ec9b15805f7ba67864dbbe503b87fad03d1f 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -305,7 +305,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
         if enable_eplb:
             raise NotImplementedError(
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 5f53d313f3f5c150c1b3724275f76a09d7affe25..f935bdd84124a975b00b2c9bab34182f368426ad 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from enum import Enum
+from typing import Callable, Optional, Union
 
 import torch
 from torch.nn.parameter import Parameter
@@ -33,33 +34,72 @@ from vllm.utils.flashinfer import has_flashinfer
 logger = init_logger(__name__)
 
 
-def _should_use_flashinfer_mxfp4_bf16():
-    """Determine if FlashInfer MXFP4 BF16 should be used."""
-    # If explicitly set, respect the setting
-    if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"):
-        return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-
-    # Enable by default on SM100 if MXFP8 is not explicitly enabled
-    if (current_platform.is_device_capability(100) and has_flashinfer()
-            and not envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")):
-        logger.info_once(
-            "Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell. "
-            "For faster performance, consider setting "
-            "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, "
-            "though this may impact accuracy.")
-        return True
-
-    return False
-
-
-def _should_use_flashinfer_mxfp4_mxfp8():
-    """Determine if FlashInfer MXFP4 MXFP8 should be used."""
-    return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+# enum for mxfp4 backend
+class Mxfp4Backend(Enum):
+    NONE = 0
+
+    # FlashInfer Backend
+    SM100_FI_MXFP4_MXFP8_TRTLLM = 1
+    SM100_FI_MXFP4_MXFP8_CUTLASS = 2
+    SM100_FI_MXFP4_BF16 = 3
+    SM90_FI_MXFP4_BF16 = 4
+
+    # Marlin Backend
+    MARLIN = 5
+
+    # Triton Backend
+    TRITON = 6
+
+
+def get_mxfp4_backend():
+    # Backend Selection
+    if current_platform.is_cuda():
+        if (current_platform.is_device_capability(90) and has_flashinfer()
+                and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+            logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
+            return Mxfp4Backend.SM90_FI_MXFP4_BF16
+        elif (current_platform.is_device_capability(100) and has_flashinfer()
+              and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS):
+            logger.info_once(
+                "Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
+            return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+        elif (current_platform.is_device_capability(100) and has_flashinfer()
+              and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8):
+            logger.info_once(
+                "Using FlashInfer MXFP4 MXFP8 TRTLLM backend for SM100, "
+                "for high concurrency throughput workloads consider setting "
+                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS=1 for better "
+                "performance")
+            return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+        elif current_platform.is_device_capability(100) and has_flashinfer():
+            logger.info_once(
+                "Using FlashInfer MXFP4 BF16 backend for SM100, "
+                "For faster performance on SM100, consider setting "
+                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact "
+                "accuracy.")
+            return Mxfp4Backend.SM100_FI_MXFP4_BF16
+        elif ((current_platform.is_device_capability(100)
+               or current_platform.is_device_capability(90))
+              and not has_flashinfer()):
+            logger.warning_once(
+                "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer "
+                "is not available. This may result in degraded performance. "
+                "Please `pip install vllm[flashinfer]` for best results.")
 
+        # If FlashInfer is not available, try either Marlin or Triton
+        if current_platform.get_device_capability(
+        )[0] < 9 or not has_triton_kernels() or not is_torch_equal_or_newer(
+                "2.8.0"):
+            logger.info_once("Using Marlin backend")
+            return Mxfp4Backend.MARLIN
+        else:
+            logger.info_once("Using Triton backend")
+            return Mxfp4Backend.TRITON
+    elif current_platform.is_rocm() and has_triton_kernels():
+        logger.info_once("Using Triton backend")
+        return Mxfp4Backend.TRITON
 
-def should_use_flashinfer_mxfp4():
-    return (_should_use_flashinfer_mxfp4_mxfp8()
-            or _should_use_flashinfer_mxfp4_bf16())
+    return Mxfp4Backend.NONE
 
 
 class Mxfp4Config(QuantizationConfig):
@@ -113,35 +153,14 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         super().__init__(moe)
         self.topk_indices_dtype = None
         self.moe = moe
-        self.use_marlin = self._should_use_marlin()
+        self.mxfp4_backend = get_mxfp4_backend()
         self.max_capture_size = get_current_vllm_config(
         ).compilation_config.max_capture_size
 
-        if current_platform.is_device_capability(100) and not has_flashinfer():
-            logger.warning_once(
-                "MXFP4 MoE is enabled on Blackwell but FlashInfer "
-                "is not available. This may result in degraded performance. "
-                "Please `pip install vllm[flashinfer]` for best results.")
-
-        if current_platform.is_device_capability(100) and not has_flashinfer():
-            logger.warning_once(
-                "MXFP4 MoE is enabled on Blackwell but FlashInfer "
-                "is not available. This may result in degraded performance. "
-                "Please `pip install vllm[flashinfer]` for best results.")
-
-    def _should_use_marlin(self):
-        if envs.VLLM_MXFP4_USE_MARLIN is not None:
-            return envs.VLLM_MXFP4_USE_MARLIN
-        if current_platform.is_cuda() and \
-                not current_platform.is_device_capability(100):
-            if not current_platform.has_device_capability(90):
-                # marlin kernel has better performance on ampere
-                return True
-            if not has_triton_kernels():
-                return True
-            if not is_torch_equal_or_newer("2.8.0"):
-                return True
-        return False
+        assert self.mxfp4_backend != Mxfp4Backend.NONE, (
+            "No MXFP4 MoE backend (FlashInfer/Marlin/Triton) available."
+            "Please check your environment and try again.")
+        self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
@@ -162,7 +181,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
         intermediate_size_per_partition_after_pad = \
             intermediate_size_per_partition
-        if self.use_marlin:
+        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             # The moe marlin kernel requires that for each linear
             # n % 256 == 0 and k % 128 == 0.
             # In gate_up_proj:
@@ -180,16 +199,20 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             layer.hidden_size = hidden_size
             layer.intermediate_size_per_partition = \
                 intermediate_size_per_partition_after_pad
-        elif should_use_flashinfer_mxfp4():
+        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+              or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
             # pad the intermediate size to be a multiple of 2 * mxfp4_block
             # for to hold non-uniform sharded tensor as well as swizzling
             # other padding to increase performance
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 256)
             hidden_size = round_up(hidden_size, 256)
-        elif current_platform.is_rocm():
+        elif current_platform.is_rocm() or (
+                self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+                or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 128)
+            hidden_size = round_up(hidden_size, 128)
         else:
             intermediate_size_per_partition_after_pad = round_up(
                 intermediate_size_per_partition, 64)
@@ -269,10 +292,14 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         set_weight_attrs(w2_bias, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer):
-        if self.use_marlin:
+        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             prepare_moe_fp4_layer_for_marlin(layer)
-        elif should_use_flashinfer_mxfp4():
-            from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a
+        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+              or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+            from flashinfer.fp4_quantization import (
+                nvfp4_block_scale_interleave)
+            from flashinfer.fused_moe.core import (
+                _maybe_get_cached_w2_permute_indices)
             layer.gemm1_alpha = Parameter(torch.tensor(
                 [1.702] * self.num_experts, dtype=torch.float32).cuda(),
                                           requires_grad=False)
@@ -316,7 +343,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             w13_bias = layer.w13_bias.data.to(torch.float32)
             w2_bias = layer.w2_bias.data.to(torch.float32)
 
-            # Swap w1 and w3 as the defenition of
+            # Swap w1 and w3 as the definition of
             # swiglu is different in the trtllm-gen
             def swap_every_two_rows(x, axis=-1):
                 shape = x.shape
@@ -349,25 +376,63 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             gemm2_bias_shuffled = []
             epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
             for i in range(self.num_experts):
-                gemm1_weights_mxfp4_shuffled.append(
-                    shuffle_matrix_a(w13_weight[i].view(torch.uint8),
-                                     epilogue_tile_m))
+                # w13 weight shuffling
+                permute_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w13_weight[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
+                gemm1_weights_mxfp4_shuffled.append(w13_weight[i].view(
+                    torch.uint8)[permute_indices.to(
+                        w13_weight.device)].contiguous())
+                # w13 scale shuffling
+                permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w13_weight_scale[i].view(torch.uint8),
+                    epilogue_tile_m,
+                    num_elts_per_sf=16,
+                )
                 gemm1_scales_mxfp4_shuffled.append(
-                    shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8),
-                                        epilogue_tile_m))
-                gemm1_bias_shuffled.append(
-                    shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1),
-                                     epilogue_tile_m))
-
-                gemm2_weights_mxfp4_shuffled.append(
-                    shuffle_matrix_a(w2_weight[i].view(torch.uint8),
-                                     epilogue_tile_m))
+                    nvfp4_block_scale_interleave(w13_weight_scale[i].view(
+                        torch.uint8)[permute_sf_indices.to(
+                            w13_weight_scale.device)].contiguous()))
+                # w13 bias shuffling
+                permute_bias_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w13_bias[i].clone().reshape(-1, 1),
+                    epilogue_tile_m,
+                )
+                gemm1_bias_shuffled.append(w13_bias[i].clone().reshape(
+                    -1,
+                    1)[permute_bias_indices.to(w13_bias.device)].contiguous())
+                # w2 weight shuffling
+                permute_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w2_weight[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
+                gemm2_weights_mxfp4_shuffled.append(w2_weight[i].view(
+                    torch.uint8)[permute_indices.to(
+                        w2_weight.device)].contiguous())
+                # w2 scale shuffling
+                permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w2_weight_scale[i].view(torch.uint8),
+                    epilogue_tile_m,
+                    num_elts_per_sf=16,
+                )
                 gemm2_scales_mxfp4_shuffled.append(
-                    shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8),
-                                        epilogue_tile_m))
-                gemm2_bias_shuffled.append(
-                    shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1),
-                                     epilogue_tile_m))
+                    nvfp4_block_scale_interleave(w2_weight_scale[i].view(
+                        torch.uint8)[permute_sf_indices.to(
+                            w2_weight_scale.device)].contiguous()))
+                # w2 bias shuffling
+                permute_indices = _maybe_get_cached_w2_permute_indices(
+                    self._cache_permute_indices,
+                    w2_bias[i].clone().reshape(-1, 1),
+                    epilogue_tile_m,
+                )
+                gemm2_bias_shuffled.append(w2_bias[i].clone().reshape(
+                    -1, 1)[permute_indices.to(w2_bias.device)].contiguous())
 
             w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
             w13_weight_scale = torch.stack(
@@ -393,7 +458,116 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape(
                 self.num_experts, -1),
                                       requires_grad=False)
-        else:
+        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
+            layer.gemm1_alpha = Parameter(torch.tensor(
+                [1.702] * self.num_experts, dtype=torch.float32).cuda(),
+                                          requires_grad=False)
+            layer.gemm1_beta = Parameter(torch.tensor(
+                [1.0] * self.num_experts, dtype=torch.float32).cuda(),
+                                         requires_grad=False)
+            layer.gemm1_clamp_limit = Parameter(torch.tensor(
+                [7.0] * self.num_experts, dtype=torch.float32).cuda(),
+                                                requires_grad=False)
+
+            sf_block_size = 32  # mxfp4 block size
+
+            # Common shape assertions
+            assert (layer.w13_weight.dim() == 3
+                    and layer.w13_weight.shape[0] == self.num_experts
+                    and layer.w13_weight.shape[1] == self.intermediate_size * 2
+                    and layer.w13_weight.shape[2] == self.hidden_size // 2)
+            assert (layer.w13_weight_scale.dim() == 3
+                    and layer.w13_weight_scale.shape[0] == self.num_experts
+                    and layer.w13_weight_scale.shape[1]
+                    == self.intermediate_size * 2
+                    and layer.w13_weight_scale.shape[2]
+                    == self.hidden_size // sf_block_size)
+            assert (layer.w2_weight.dim() == 3
+                    and layer.w2_weight.shape[0] == self.num_experts
+                    and layer.w2_weight.shape[1] == self.hidden_size and
+                    layer.w2_weight.shape[2] == self.intermediate_size // 2)
+            assert (layer.w2_weight_scale.dim() == 3
+                    and layer.w2_weight_scale.shape[1] == self.hidden_size
+                    and layer.w2_weight_scale.shape[2]
+                    == self.intermediate_size // sf_block_size)
+            assert (layer.w13_bias.dim() == 2
+                    and layer.w13_bias.shape[0] == self.num_experts
+                    and layer.w13_bias.shape[1] == self.intermediate_size * 2)
+            assert (layer.w2_bias.dim() == 2
+                    and layer.w2_bias.shape[0] == self.num_experts
+                    and layer.w2_bias.shape[1] == self.hidden_size)
+
+            # De-interleave and swap for w13 weight, bias, and scales
+            w13_w = layer.w13_weight.data
+            gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :]
+            deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1)
+            w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1)
+            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
+
+            w13_b = layer.w13_bias.data.to(torch.float32)
+            gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2]
+            deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1)
+            b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1)
+            w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
+
+            w13_s = layer.w13_weight_scale.data
+            gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :]
+            deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1)
+            s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1)
+            w13_scale_swapped = torch.cat([s3, s1], dim=1)
+
+            if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS:
+                from flashinfer import block_scale_interleave
+
+                orig_shape = w13_scale_swapped.shape
+                w13_scale_interleaved = block_scale_interleave(
+                    w13_scale_swapped.view(torch.uint8)).reshape(orig_shape)
+
+                w2_s = layer.w2_weight_scale.data
+                orig_shape = w2_s.shape
+                w2_scale_interleaved = block_scale_interleave(
+                    w2_s.view(torch.uint8)).reshape(orig_shape)
+
+                layer.w13_weight = Parameter(w13_weight_swapped,
+                                             requires_grad=False)
+                layer.w13_weight_scale = Parameter(w13_scale_interleaved,
+                                                   requires_grad=False)
+                layer.w13_bias = Parameter(w13_bias_swapped,
+                                           requires_grad=False)
+                layer.w2_weight_scale = Parameter(w2_scale_interleaved,
+                                                  requires_grad=False)
+            elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16:
+
+                def _interleave_mxfp4_cutlass_sm90(w):
+                    w_shape = w.shape
+                    w_interleaved = w.reshape(w_shape[0], w_shape[1],
+                                              (w_shape[2] // 4), 4)
+                    w_interleaved = w_interleaved.permute(0, 2, 1, 3)
+                    w_interleaved = w_interleaved.reshape(
+                        w_shape[0], w_shape[2] // 4, w_shape[1] * 4)
+                    return w_interleaved
+
+                w31_scales = w13_scale_swapped.to(torch.uint8).view(
+                    torch.uint8)
+                w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(
+                    w31_scales)
+
+                w2_weight_scale = layer.w2_weight_scale.data
+                w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8)
+                w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(
+                    w2_scales)
+
+                layer.w13_weight = torch.nn.Parameter(torch.cat([w3_w, w1_w],
+                                                                dim=1),
+                                                      requires_grad=False)
+                layer.w13_bias = torch.nn.Parameter(w13_bias_swapped,
+                                                    requires_grad=False)
+                layer.w13_weight_scale = torch.nn.Parameter(
+                    w31_scales_interleaved, requires_grad=False)
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_scales_interleaved, requires_grad=False)
+        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 
             w13_bias = layer.w13_bias.to(torch.float32)
@@ -428,6 +602,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             layer.w13_weight = None
             layer.w2_weight = None
             torch.cuda.empty_cache()
+        else:
+            raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
 
     def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int):
         # Number of tokens in the input tensor.
@@ -464,7 +640,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             raise NotImplementedError(
                 "Mxfp4 does not support batched experts format for EP")
         else:
-            if should_use_flashinfer_mxfp4():
+            if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+                    or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
                 # B200 code-path
                 kwargs = {
                     "gemm1_alpha": layer.gemm1_alpha,
@@ -560,12 +737,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
 
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
-        if self.use_marlin:
+        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             topk_weights, topk_ids = FusedMoE.select_experts(
                 hidden_states=x,
                 router_logits=router_logits,
@@ -629,16 +806,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             logical_replica_count), (
                 "MXFP4 are not supported with this configuration.")
 
-        if should_use_flashinfer_mxfp4():
-            from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
-            if _should_use_flashinfer_mxfp4_bf16():
+        if (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+                or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16):
+            from flashinfer import trtllm_fp4_block_scale_moe
+            if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16:
                 assert x.dtype == torch.bfloat16
                 x_quant = x
                 x_scale = None
-            else:
+            elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM:
+                from flashinfer import mxfp8_quantize
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
                 x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
                     *x.shape[:-1], -1)
+
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -670,7 +850,86 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 tune_max_num_tokens=self.max_capture_size,
             )[0]
             return trtllm_gen_output
-        else:
+        elif (self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
+              or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16):
+            from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
+
+            topk_weights, topk_ids = FusedMoE.select_experts(
+                hidden_states=x,
+                router_logits=router_logits,
+                use_grouped_topk=use_grouped_topk,
+                top_k=top_k,
+                renormalize=renormalize,
+                topk_group=topk_group,
+                num_expert_group=num_expert_group,
+                custom_routing_function=custom_routing_function,
+                scoring_func=scoring_func,
+                e_score_correction_bias=e_score_correction_bias,
+            )
+
+            # Backend-specific preparation
+            if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS:
+
+                from flashinfer import mxfp8_quantize
+
+                x_quant, x_scale = mxfp8_quantize(x, True, 32)
+
+                fake_input_scale = torch.ones(self.num_experts,
+                                              device=x.device)
+                quant_scales = [
+                    layer.w13_weight_scale.contiguous().view(torch.int32),
+                    fake_input_scale,
+                    layer.w2_weight_scale.contiguous().view(torch.int32),
+                    fake_input_scale,
+                ]
+
+                fi_input = x_quant
+                extra_kwargs = dict(
+                    use_mxfp8_act_scaling=True,
+                    input_sf=x_scale,
+                    fc1_expert_weights=layer.w13_weight.contiguous().view(
+                        torch.long),
+                    fc2_expert_weights=layer.w2_weight.contiguous().view(
+                        torch.long),
+                )
+            elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16:
+                assert x.dtype == torch.bfloat16
+
+                quant_scales = [
+                    layer.w13_weight_scale,
+                    layer.w2_weight_scale,
+                ]
+
+                fi_input = x
+                extra_kwargs = dict(
+                    use_w4_group_scaling=True,
+                    fc1_expert_weights=layer.w13_weight,
+                    fc2_expert_weights=layer.w2_weight,
+                )
+
+            output = torch.empty_like(x, dtype=torch.bfloat16)
+            _ = flashinfer_cutlass_fused_moe(
+                input=fi_input,
+                token_selected_experts=topk_ids.to(torch.int).contiguous(),
+                token_final_scales=topk_weights,
+                output_dtype=torch.bfloat16,
+                output=output,
+                quant_scales=quant_scales,
+                fc1_expert_biases=layer.w13_bias,
+                fc2_expert_biases=layer.w2_bias,
+                swiglu_alpha=layer.gemm1_alpha,
+                swiglu_beta=layer.gemm1_beta,
+                swiglu_limit=layer.gemm1_clamp_limit,
+                tp_size=self.moe.tp_size,
+                tp_rank=self.moe.tp_rank,
+                ep_size=self.moe.ep_size,
+                ep_rank=self.moe.ep_rank,
+                tune_max_num_tokens=self.max_capture_size,
+                **extra_kwargs,
+            )
+
+            return output
+        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
             from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (  # noqa: E501
                 triton_kernel_moe_forward)
             return triton_kernel_moe_forward(
@@ -688,3 +947,5 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 w2_precision=self.w2_precision_config,
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
+        else:
+            raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py
deleted file mode 100644
index 8040236663dd1eadf924de928c643e4b405ef222..0000000000000000000000000000000000000000
--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from importlib.util import find_spec
-from typing import Any, Optional
-
-from torch.nn import Module
-
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-
-SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn']
-
-
-class AlwaysSupportedDtypes(list):
-
-    def __contains__(self, item):
-        return True
-
-
-class NeuronQuantConfig(QuantizationConfig):
-    """Int8 Quantization Config class for Neuron Backend."""
-
-    def __init__(
-        self,
-        dequant_dtype: str = "f16",
-        quantize_method: str = "vector_dynamic",
-    ) -> None:
-        super().__init__()
-        self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8")
-        if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST:
-            raise ValueError(
-                f"Neuron quantization datatype {self.quant_dtype} is not valid,"
-                f" the quantization datatype should match one of the below "
-                f"types {SUPPORTED_QUANT_DTYPE_LIST}")
-        self.dequant_dtype = dequant_dtype
-        self.quantize_method = quantize_method
-
-    def get_name(self) -> QuantizationMethods:
-        return "neuron_quant"
-
-    def get_supported_act_dtypes(self) -> list[str]:
-        # Neuron implements custom handling logic for quantization support
-        return AlwaysSupportedDtypes()
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "This function should not be called with Neuron Backend")
-
-    @staticmethod
-    def get_config_filenames() -> list[str]:
-        return []
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig":
-        quantize_method = cls.get_from_keys(config, ["quantize_method"])
-        dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"])
-        return cls(dequant_dtype=dequant_dtype,
-                   quantize_method=quantize_method)
-
-    def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]:
-        if find_spec("transformers_neuronx") is not None:
-            return self.get_quantization_config()
-        else:
-            raise NotImplementedError(
-                "Neuron Quantization is only supported through"
-                " transformers_neuronx.")
-
-    def get_quantization_config(self):
-        from transformers_neuronx.config import QuantizationConfig
-        return QuantizationConfig(quant_dtype=self.quant_dtype,
-                                  dequant_dtype=self.dequant_dtype,
-                                  quantize_method=self.quantize_method)
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 466fd5fba768542806c161f278a5522f7158e9a4..45ea8e3520f1d85df1c93a15ab3b2851f6dcf30c 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -92,13 +92,13 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
     """
 
     def __init__(self, quant_config: PTPCFp8Config):
+        assert current_platform.is_rocm(), \
+            "PTPCFp8LinearMethod is only supported on ROCm."
         super().__init__(quant_config=quant_config)
         # Force weight quantization
         self.quant_config.is_checkpoint_fp8_serialized = False
         self.fp8_linear = Fp8LinearOp(
-            act_quant_static=False,
-            act_quant_group_shape=GroupShape.PER_TOKEN,
-            force_fp8_e4m3fnuz=True)
+            act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.weight = torch.nn.Parameter(layer.weight.data,
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index fdf03ded044807c22d5f1417fe875a3f5d40de8b..6cff9f3019d34360d06a8ab174e5eb7e967faec3 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -226,7 +226,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -390,7 +390,7 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 8f72b8cbea7a75c133fcb7fe36d2f3a5fcdc6a1e..0d5fa05652b8062c0598a9cb578ac8e4dbdd0dd3 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -3,7 +3,7 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 
 import os
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -291,7 +291,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 63b2ab6bab0638f7dfe9c3f2837458ff82986778..3498d2994c2aba619446394fa6d027a1efb0e073 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -152,18 +152,20 @@ def torchao_quantize_param_data(param: torch.Tensor,
     from torchao.quantization import quantize_
 
     assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}"
-    """ 
-    Avoid real weight allocation for faster load, since we will 
+    """
+    Avoid real weight allocation for faster load, since we will
     end up setting it to param.
     """
     with torch.device("meta"):
-        dummy_linear = torch.nn.Linear(param.shape[1],
-                                       param.shape[0],
-                                       bias=False)
+        # linear can't be top level module since quantize_ is inplace
+        # while some of our configs need to do module swap, and only non-top
+        # level modules support module swap
+        dummy_linear = torch.nn.Sequential(
+            torch.nn.Linear(param.shape[1], param.shape[0], bias=False))
 
-    dummy_linear.weight = param
+    dummy_linear[0].weight = param
     quantize_(dummy_linear, torchao_config)
-    return dummy_linear.weight
+    return dummy_linear[0].weight
 
 
 class TorchAOLinearMethod(LinearMethodBase):
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index d08f421b68f0406df30bcd8652dd5b6f06ad18ee..c36e1dd40346690fae208a2544ac6a822c195aac 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -41,11 +41,14 @@ def cutlass_scaled_mm(
     block_size: List[int],
     output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
-    return ops.cutlass_scaled_mm(A,
-                                 B.T,
-                                 out_dtype=output_dtype,
-                                 scale_a=As,
-                                 scale_b=Bs.T)
+    return ops.cutlass_scaled_mm(
+        A,
+        B.T,
+        out_dtype=output_dtype,
+        scale_a=As,
+        # SM90 block FP8 requires row-major scale_b, which we do ahead of time
+        scale_b=Bs if block_size is not None
+        and current_platform.is_device_capability(90) else Bs.T)
 
 
 def rocm_aiter_gemm_w8a8_blockscale_impl(
@@ -153,35 +156,32 @@ def apply_w8a8_block_fp8_linear(
             output += bias
         return output.to(dtype=output_dtype).view(*output_shape)
 
-    if current_platform.is_cuda():
-        if current_platform.has_device_capability(100):
-
-            use_cutlass = cutlass_block_fp8_supported and (
-                cdiv(weight.shape[0], 128) == weight_scale.shape[0]
-                and cdiv(weight.shape[1], 128) == weight_scale.shape[1])
-        else:
-            # TODO: update this after switching to public sm90 block scale gemm
-            # as it also supports weight.shape % 128 != 0
-            use_cutlass = cutlass_block_fp8_supported and (
-                weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0)
-    else:
-        use_cutlass = False
-
     w8a8_blockscale_func = dispatch_w8a8_blockscale_func(
-        use_cutlass, use_aiter_and_is_supported)
-    if use_cutlass:
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d, block_size[1], column_major_scales=use_cutlass)
+        cutlass_block_fp8_supported, use_aiter_and_is_supported)
+    if cutlass_block_fp8_supported:
+        num_pad = 0
+        if current_platform.is_device_capability(90):
+            # pad first dimension to be divisible by 4 due to
+            # cutlass blockwise gemm limitation for hopper
+            num_pad = 4 - (input_2d.shape[0] % 4)
+            if num_pad > 0:
+                input_2d = torch.nn.functional.pad(input_2d,
+                                                   (0, 0, 0, num_pad),
+                                                   "constant", 0)
+        q_input, x_scale = per_token_group_quant_fp8(input_2d,
+                                                     block_size[1],
+                                                     column_major_scales=True)
         output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
                                       block_size, input.dtype)
-
+        if num_pad > 0:
+            output = output[:-num_pad]
     else:
         if use_aiter_and_is_supported:
             q_input, x_scale = aiter_per1x128_quant(
                 input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8)
         else:
             q_input, x_scale = per_token_group_quant_fp8(
-                input_2d, block_size[1], column_major_scales=use_cutlass)
+                input_2d, block_size[1], column_major_scales=False)
 
         output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
                                       block_size, input.dtype)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 02057b476c6e212536e2e27096bd29ec58418b28..317ad079b392df967c02b697c39bc5d8c4b9f404 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int,
 def marlin_make_workspace_new(device: torch.device,
                               max_blocks_per_sm: int = 1) -> torch.Tensor:
     # In the new marlin kernel, we use the num of threadblocks as workspace
-    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    # size. The num of threadblocks is sms_count * max_blocks_per_sm.
     sms = torch.cuda.get_device_properties(device).multi_processor_count
     return torch.zeros(sms * max_blocks_per_sm,
                        dtype=torch.int,
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 6154fca2e416df7e9811efd95c267ee01c2b3a9a..f4ff875adb21c5b4476d0f5de4c830e7dd313976 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -116,7 +116,7 @@ def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape):
 # then we would expand a to:
 #       a = [[1, 1, 2, 2],
 #            [3, 3, 4, 4]]
-# NOTE this function this function does not explicitly broadcast dimensions
+# NOTE this function does not explicitly broadcast dimensions
 # with an extent of 1, since this can be done implicitly by pytorch
 def group_broadcast(t, shape):
     for i, s in enumerate(shape):
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 5333bbd310ff97c18305a59166dc83ef8fedaf36..e89a5e643b0e58e914b3b1c9ebb55ba9394ec828 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -23,7 +23,7 @@ TORCH_DEVICE_IDENTITY = None
 # The condition to determine if it is on a platform that supports
 # torch._scaled_mm rowwise feature.
 # The condition is determined once as the operations
-# are time consuming.
+# are time-consuming.
 USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
     torch.__version__) >= version.parse("2.7")
                                and current_platform.has_device_capability(94))
@@ -171,10 +171,12 @@ def flashinfer_w8a8_scaled_mm(*, qinput: torch.Tensor, weight: torch.Tensor,
                                     bias=bias)
 
 
-def rocm_per_tensor_w8a8_scaled_mm_impl(
-        qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype,
-        scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor,
-        input_2d: torch.Tensor) -> torch.Tensor:
+def rocm_per_tensor_w8a8_scaled_mm_impl(qinput: torch.Tensor,
+                                        weight: torch.Tensor,
+                                        out_dtype: torch.dtype,
+                                        scale_a: torch.Tensor,
+                                        scale_b: torch.Tensor,
+                                        bias: torch.Tensor) -> torch.Tensor:
     from vllm.platforms.rocm import on_mi3xx
     if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx(
     ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0:
@@ -190,10 +192,12 @@ def rocm_per_tensor_w8a8_scaled_mm_impl(
     return output
 
 
-def rocm_per_tensor_w8a8_scaled_mm_fake(
-        qinput: torch.Tensor, weight: torch.Tensor, out_dtype: torch.dtype,
-        scale_a: torch.Tensor, scale_b: torch.Tensor, bias: torch.Tensor,
-        input_2d: torch.Tensor) -> torch.Tensor:
+def rocm_per_tensor_w8a8_scaled_mm_fake(qinput: torch.Tensor,
+                                        weight: torch.Tensor,
+                                        out_dtype: torch.dtype,
+                                        scale_a: torch.Tensor,
+                                        scale_b: torch.Tensor,
+                                        bias: torch.Tensor) -> torch.Tensor:
     return qinput.new_empty((*qinput.shape[:-1], weight.shape[1]),
                             dtype=out_dtype)
 
@@ -203,11 +207,10 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                    out_dtype: torch.dtype,
                                    scale_a: torch.Tensor,
                                    scale_b: torch.Tensor, bias: torch.Tensor,
-                                   input_2d: torch.Tensor,
                                    output_shape: list) -> torch.Tensor:
     output = torch.ops.vllm.rocm_per_tensor_w8a8_scaled_mm_impl(
-        qinput, weight, out_dtype, scale_a, scale_b, bias, input_2d)
-    return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
+        qinput, weight, out_dtype, scale_a, scale_b, bias)
+    return torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape)
 
 
 direct_register_custom_op(
@@ -224,7 +227,6 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                     out_dtype: torch.dtype,
                                     scale_a: torch.Tensor,
                                     scale_b: torch.Tensor, bias: torch.Tensor,
-                                    input_2d: torch.Tensor,
                                     output_shape: list) -> torch.Tensor:
     output = torch._scaled_mm(qinput,
                               weight,
@@ -237,7 +239,7 @@ def torch_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor,
     if type(output) is tuple and len(output) == 2:
         output = output[0]
 
-    return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
+    return torch.narrow(output, 0, 0, qinput.shape[0]).view(*output_shape)
 
 
 def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor,
@@ -245,7 +247,7 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                    out_dtype: torch.dtype,
                                    scale_a: torch.Tensor,
                                    scale_b: torch.Tensor, bias: torch.Tensor,
-                                   input_2d: torch.Tensor, output_shape: list,
+                                   output_shape: list,
                                    **kwargs) -> torch.Tensor:
     # Note: Callers of this function should check USE_ROWWISE_TORCH_SCALED_MM
     #  when using it.
@@ -265,7 +267,7 @@ def torch_per_token_w8a8_scaled_mm(*, qinput: torch.Tensor,
                               scale_b=scale_b.t(),
                               bias=bias)
 
-    output = torch.narrow(output, 0, 0, input_2d.shape[0])
+    output = torch.narrow(output, 0, 0, qinput.shape[0])
     output = output.view(*output_shape)
     return output
 
@@ -275,7 +277,6 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor,
                                      out_dtype: torch.dtype,
                                      scale_a: torch.Tensor,
                                      scale_b: torch.Tensor, bias: torch.Tensor,
-                                     input_2d: torch.Tensor,
                                      output_shape: list,
                                      **kwargs) -> torch.Tensor:
     # Use unfused DQ due to limitations with scaled_mm
@@ -305,8 +306,8 @@ def torch_channelwise_w8a8_scaled_mm(*, qinput: torch.Tensor,
     if type(output) is tuple and len(output) == 2:
         output = output[0]
     # Unpad (undo num_token_padding)
-    output = torch.narrow(output, 0, 0, input_2d.shape[0])
-    x_scale = torch.narrow(scale_a, 0, 0, input_2d.shape[0])
+    output = torch.narrow(output, 0, 0, qinput.shape[0])
+    x_scale = torch.narrow(scale_a, 0, 0, qinput.shape[0])
 
     # DQ
     # C = sw * sx * (X * W) + bias
@@ -355,12 +356,10 @@ class Fp8LinearOp:
     def __init__(self,
                  act_quant_static: bool,
                  act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR,
-                 pad_output: Optional[bool] = None,
-                 force_fp8_e4m3fnuz: bool = False):
+                 pad_output: Optional[bool] = None):
         if current_platform.is_rocm():
             self.preferred_backend = "rocm"
-        elif current_platform.is_cuda(
-        ) and not force_fp8_e4m3fnuz and cutlass_fp8_supported():
+        elif current_platform.is_cuda() and cutlass_fp8_supported():
             if has_flashinfer() and current_platform.has_device_capability(
                     100):
                 self.preferred_backend = "flashinfer"
@@ -432,7 +431,6 @@ class Fp8LinearOp:
                                    scale_a=x_scale,
                                    scale_b=weight_scale,
                                    bias=bias,
-                                   input_2d=input_2d,
                                    output_shape=output_shape)
 
 
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index 10fce857a8ae271621d573c42bb516ae15302710..be25e90abf821186c823c86194c4c3b8c1e21c9a 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -7,7 +7,7 @@ import torch
 
 from vllm.model_executor.custom_op import CustomOp
 
-from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch
+from .common import apply_rotary_emb_torch
 
 
 @CustomOp.register("rotary_embedding")
@@ -149,87 +149,6 @@ class RotaryEmbedding(CustomOp):
                                      self.cos_sin_cache, self.is_neox_style)
         return query, key
 
-    def forward_neuron(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-
-        def _apply_rotary_emb_neuron(
-            x: torch.Tensor,
-            cos: torch.Tensor,
-            sin: torch.Tensor,
-            is_neox_style: bool,
-        ) -> torch.Tensor:
-            cos = cos.unsqueeze(-2).to(x.dtype)
-            sin = sin.unsqueeze(-2).to(x.dtype)
-            if is_neox_style:
-                x1, x2 = torch.chunk(x, 2, dim=-1)
-            else:
-                # x1 = x[..., ::2]
-
-                # x2 = x[..., 1::2]
-                d = x.shape[-1] // 2
-                x_reshaped = x.view(-1, x.shape[-1])
-                x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d)
-                x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d)
-            o1 = x1 * cos - x2 * sin
-            o2 = x2 * cos + x1 * sin
-            if is_neox_style:
-                return torch.cat((o1, o2), dim=-1)
-            else:
-                return torch.stack((o1, o2), dim=-1).flatten(-2)
-
-        if offsets is not None:
-            positions = positions + offsets
-
-        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
-                                                   dtype=query.dtype)
-
-        positions = positions.flatten()
-        num_tokens = positions.shape[0]
-        cos_sin = self.cos_sin_cache.index_select(0, positions)
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
-        if key is not None:
-            key_shape = key.shape
-            key = key.view(num_tokens, -1, self.head_size)
-
-        if self.rotary_dim == self.head_size:
-            query = apply_rotary_emb_dispatch(query, cos, sin,
-                                              self.is_neox_style)
-            query = query.reshape(query_shape)
-            if key is not None:
-                key = apply_rotary_emb_dispatch(key, cos, sin,
-                                                self.is_neox_style)
-                key = key.reshape(key_shape)
-        else:
-            head_size = query.shape[-1]
-            query_reshaped = query.view(-1, head_size)
-            query_pass = query_reshaped[:, self.rotary_dim:].view(
-                *query.shape[:-1], head_size - self.rotary_dim)
-            query_rot = query_reshaped[:, :self.rotary_dim].view(
-                *query.shape[:-1], self.rotary_dim)
-            query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin,
-                                                 self.is_neox_style)
-            query = torch.cat((query_rot, query_pass),
-                              dim=-1).reshape(query_shape)
-
-            if key is not None:
-                key_reshaped = key.view(-1, head_size)
-                key_pass = key_reshaped[:, self.rotary_dim:].view(
-                    *key.shape[:-1], head_size - self.rotary_dim)
-                key_rot = key_reshaped[:, :self.rotary_dim].view(
-                    *key.shape[:-1], self.rotary_dim)
-                key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin,
-                                                   self.is_neox_style)
-                key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
-        return query, key
-
     def extra_repr(self) -> str:
         s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
         s += f", max_position_embeddings={self.max_position_embeddings}"
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index cd888b733426b382d4ed00f691217e58dc744e4e..7ac2e4bb6c34ff09b541efb362a223805ba84223 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -88,7 +88,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cache = torch.cat((cos, sin), dim=-1)
         return cache
 
-    def forward(
+    def forward_native(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
@@ -129,3 +129,12 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             query = query_rot
             key = key_rot
         return query, key
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: Optional[torch.Tensor] = None,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        return self.forward_native(positions, query, key, offsets)
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index 3d8da0fa9d8f5cc3299f6ed3726ee95a5edf4da7..27e41dd0fa97e8cada833cb0cebfcd95275aba83 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -111,7 +111,7 @@ class DualChunkRotaryEmbedding(CustomOp):
                                              device=self.device)
         return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
 
-    def forward(
+    def forward_native(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
@@ -161,6 +161,15 @@ class DualChunkRotaryEmbedding(CustomOp):
                           dim=-1)
         return query, key
 
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.forward_native(positions, query, key, offsets)
+
     def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
         cos, sin = cos_sin.chunk(2, dim=-1)
         if self.is_neox_style:
diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
index 05322e56f2620c4b1a8723af92a1d2c739200bca..4960c20f4060a0cc1badf031f5838c8e8e1e3f37 100644
--- a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
@@ -12,7 +12,7 @@ from .mrope import MRotaryEmbedding
 class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
     """3D rotary positional embedding. 3D is t:time h:height w:width"""
 
-    def forward(
+    def forward_native(  # type: ignore[override]
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
@@ -70,3 +70,11 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
                                             self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
+
+    def forward_cuda(  # type: ignore[override]
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        return self.forward_native(positions, query, key)
\ No newline at end of file
diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
index 415a85ab698bcee3441679ec2b7af8d00526c95b..37ead43e22bc457b3877db25d9a71557badd8910 100644
--- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
@@ -53,7 +53,7 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
             torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
         return cache
 
-    def forward(
+    def forward_native(  # type: ignore[override]
         self,
         query: torch.Tensor,
         key: Optional[torch.Tensor] = None,
@@ -72,3 +72,10 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
         query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
         key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
         return query_out.type_as(query), key_out.type_as(key)
+
+    def forward_cuda(  # type: ignore[override]
+        self,
+        query: torch.Tensor,
+        key: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        return self.forward_native(query, key)
diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index 5686ec7b35de88aa80722acd7296ae1bf899ae00..0acb5ea7424554ffc3d0ad729f558ae06f5ed2a8 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -8,7 +8,6 @@ import numpy as np
 import torch
 from transformers import PretrainedConfig
 
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 
 from .base import RotaryEmbedding
@@ -202,28 +201,6 @@ class MRotaryEmbedding(RotaryEmbedding):
         if self.mrope_section:
             assert sum(self.mrope_section) == rotary_dim // 2
 
-        self.use_triton = current_platform.is_cuda_alike()
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """MRope forward.
-
-        Args:
-            positions:
-                [num_tokens,] (text only) or
-                [3, num_tokens] (T/H/W positions with multimodal inputs)
-            query: [num_tokens, num_heads * head_size]
-            key: [num_tokens, num_kv_heads * head_size]
-        """
-        if self.use_triton:
-            return self.forward_cuda(positions, query, key)
-        else:
-            return self.forward_native(positions, query, key)
-
     def forward_native(
         self,
         positions: torch.Tensor,
@@ -402,6 +379,15 @@ class MRotaryEmbedding(RotaryEmbedding):
                 context_len=context_len,
                 seq_len=seq_len,
             )
+        elif "KeyeVL1_5" in hf_config.model_type:
+            return cls._keye_get_input_positions_tensor(
+                input_tokens=input_tokens,
+                hf_config=hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                context_len=context_len,
+                seq_len=seq_len,
+            )
         else:
             return cls._vl_get_input_positions_tensor(
                 input_tokens=input_tokens,
@@ -636,6 +622,126 @@ class MRotaryEmbedding(RotaryEmbedding):
                                 len(input_tokens)).item()
         return llm_positions, mrope_position_delta
 
+    @classmethod
+    def _keye_get_input_positions_tensor(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+    ) -> tuple[torch.Tensor, int]:
+        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
+            video_grid_thw = video_grid_thw[0]
+        """Get mrope input positions and delta value (Keye series)."""
+
+        def split_thw(
+                grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]:
+            """
+            Split grid_thw along the t dimension.
+
+            Args:
+                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
+
+            Returns:
+                List of [1, h, w] rows, repeated t times for each original row.
+            """
+
+            if isinstance(grid_thw, list):
+                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
+
+            if grid_thw.numel() == 0:
+                return []
+
+            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
+            ones = torch.ones_like(hw[:, :1])  # [N,1]
+            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
+            return out.tolist()
+
+        video_grid_thw = split_thw(video_grid_thw)
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        image_nums = len(image_grid_thw)
+        frame_nums = len(video_grid_thw)
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_frames = image_nums, frame_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + frame_nums):
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_frames > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_frames -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w)).long().flatten()
+
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
+
     @classmethod
     def _vl_get_input_positions_tensor(
         cls,
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index e77eb637c8942129f085f1f83fd67bd4fb0a5538..829dd82b0bd4d9f88f188e56b159e98e41d9a4dc 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -13,14 +13,14 @@ import torch
 import torch.nn as nn
 
 import vllm.envs as envs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.model_executor.layers.utils import apply_penalties
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
                                                    SamplingTensors,
                                                    SequenceGroupToSample)
 from vllm.sampling_params import SamplingType
 from vllm.sequence import (VLLM_INVALID_TOKEN_ID,
-                           CompletionSequenceGroupOutput, Logprob,
-                           PromptLogprobs, SampleLogprobs, SequenceOutput)
+                           CompletionSequenceGroupOutput, SequenceOutput)
 
 if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"):
     # yapf: disable
diff --git a/vllm/model_executor/layers/shared_fused_moe/__init__.py b/vllm/model_executor/layers/shared_fused_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b87c69d3edd04d50cfedb73da028cf9677a313cd
--- /dev/null
+++ b/vllm/model_executor/layers/shared_fused_moe/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.model_executor.layers.shared_fused_moe.shared_fused_moe import (
+    SharedFusedMoE)
+
+__all__ = ["SharedFusedMoE"]
diff --git a/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1e3d188d985233500e579ab7ed0d183840904e6
--- /dev/null
+++ b/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
+import torch
+
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+
+# TODO(bnell): Add shared + fused combo function? e.g. +
+class SharedFusedMoE(FusedMoE):
+    """
+    A FusedMoE operation that also computes the results of shared experts.
+    If an all2all communicator is being used the shared expert computation
+    can be interleaved with the fused all2all dispatch communication step.
+    """
+
+    def __init__(
+        self,
+        shared_experts: torch.nn.Module,
+        use_overlapped: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._shared_experts = shared_experts
+        self.use_overlapped = use_overlapped
+
+    @property
+    def shared_experts(self) -> Optional[torch.nn.Module]:
+        return self._shared_experts if self.use_overlapped else None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if not self.use_overlapped:
+            shared_out = self._shared_experts(hidden_states)
+
+            # Reduce outputs if necessary, since the MLP should
+            # have been created with reduce_results=False.
+            if (self.reduce_results and self.tp_size > 1
+                    and self.must_reduce_shared_expert_outputs()):
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
+
+            fused_out = super().forward(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+        else:
+            shared_out, fused_out = super().forward(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+        return shared_out, fused_out
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 2897f75b3129e8804f686aedacf742216bd74d72..d2b135c1e4d4e0293f35f6e2f0fa57d44815fe78 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -142,20 +142,49 @@ direct_register_custom_op(
 )
 
 
-def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype):
+def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype) -> bool:
     return (torch._C._cpu._is_amx_tile_supported()
             and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0
             and n % 16 == 0)
 
 
+def dispatch_cpu_unquantized_gemm(
+    layer: torch.nn.Module,
+    remove_weight: bool,
+) -> None:
+    N, K = layer.weight.size()
+    dtype = layer.weight.dtype
+    if envs.VLLM_CPU_SGL_KERNEL and check_cpu_sgl_kernel(N, K, dtype):
+        packed_weight = torch.ops._C.convert_weight_packed(layer.weight)
+        if getattr(layer, "bias", None) is not None:
+            bias_f32 = layer.bias.to(torch.float32)
+        else:
+            bias_f32 = None
+        layer.cpu_linear = (
+            lambda x, weight, bias: torch.ops._C.weight_packed_linear(
+                x, packed_weight, bias_f32
+                if bias is not None else None, True))
+        if remove_weight:
+            layer.weight = torch.nn.Parameter(torch.empty(0),
+                                              requires_grad=False)
+    elif ops._supports_onednn:
+        origin_weight = layer.weight
+        if remove_weight:
+            layer.weight = torch.nn.Parameter(torch.empty(0),
+                                              requires_grad=False)
+        handler = ops.create_onednn_mm(origin_weight.t(), 32)
+        layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(
+            handler, x, bias)
+    else:
+        layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear(
+            x, weight, bias)
+
+
 def cpu_unquantized_gemm(layer: torch.nn.Module,
                          x: torch.Tensor,
                          weight: torch.Tensor,
                          bias: Optional[torch.Tensor] = None):
-    if getattr(layer, "use_cpu_sgl", False):
-        return torch.ops._C.weight_packed_linear(x, weight, bias, True)
-    else:
-        return torch.nn.functional.linear(x, weight, bias)
+    return layer.cpu_linear(x, weight, bias)
 
 
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 9f223998e554ff43942e59e5bd39c52123dafcce..c915ebac91c59b3f05dc2d566d552ebf57781075 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -40,6 +40,12 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if current_platform.is_cpu():
+            from vllm.model_executor.layers.utils import (
+                dispatch_cpu_unquantized_gemm)
+            dispatch_cpu_unquantized_gemm(layer, remove_weight=False)
+
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
@@ -393,7 +399,7 @@ class VocabParallelEmbedding(CustomOp):
         param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
         param[loaded_weight.shape[0]:].data.fill_(0)
 
-    def forward(self, input_):
+    def forward_native(self, input_):
         if self.tp_size > 1:
             # Build the mask.
             masked_input, input_mask = get_masked_input_and_mask(
@@ -414,6 +420,9 @@ class VocabParallelEmbedding(CustomOp):
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output
 
+    def forward_cuda(self, input_):
+        return self.forward_native(input_)
+
     def extra_repr(self) -> str:
         s = f"num_embeddings={self.num_embeddings_per_partition}"
         s += f", embedding_dim={self.embedding_dim}"
@@ -423,6 +432,7 @@ class VocabParallelEmbedding(CustomOp):
         return s
 
 
+@CustomOp.register("parallel_lm_head")
 class ParallelLMHead(VocabParallelEmbedding):
     """Parallelized LM head.
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index 2dada794a8f3ed45236795f7f37ec44cf13263ac..138a2ff30b62276cebeeefb060071d528bdae104 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -5,7 +5,8 @@ from typing import Literal, Optional
 
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
+from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.bitsandbytes_loader import (
@@ -67,7 +68,7 @@ def register_model_loader(load_format: str):
         load_format (str): The model loader format name.
 
     Examples:
-        >>> from vllm.config import LoadConfig
+        >>> from vllm.config.load import LoadConfig
         >>> from vllm.model_executor.model_loader import get_model_loader, register_model_loader
         >>> from vllm.model_executor.model_loader.base_loader import BaseModelLoader
         >>>
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index 4cf6c7988960dd2dd51f97081fc9f9aef14c9f74..ab538a3c95620a799434ded18a9779d0867f7225 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -5,7 +5,8 @@ from abc import ABC, abstractmethod
 import torch
 import torch.nn as nn
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
+from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.utils import (
     initialize_model, process_weights_after_loading, set_default_torch_dtype)
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index b8393956eed3fcb8462cbb8efa2d301aacfeb15d..9c34159f9a26991979279409498a5caa6bbeb42e 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -16,7 +16,8 @@ from packaging import version
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 # yapf: enable
@@ -69,6 +70,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         # Store all module names (from transformers) that support
         # BNB quantization.
         self.target_modules: list[str] = []
+        self.tp_disabled_modules: list[str] = []
         # Store the mapping of expert parameters for MoE models.
         self.expert_params_mapping: list[tuple[str, str, int, str]] = []
         # mapping weight names from transformers to vllm.
@@ -322,14 +324,24 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                quant_state_dict) -> Generator:
         from bitsandbytes.functional import quantize_4bit
 
-        tp_size = get_tensor_model_parallel_world_size()
-        tp_rank = get_tensor_model_parallel_rank()
+        global_tp_size = get_tensor_model_parallel_world_size()
+        global_tp_rank = get_tensor_model_parallel_rank()
 
         for (
                 org_weight_name,
                 mapped_weight_name,
                 weight_tensor,
         ) in self._hf_weight_iter(hf_weights_files, use_safetensors):
+
+            # override tp_size and tp_rank if the module has disabled TP
+            if any(tp_disabled_module in mapped_weight_name
+                   for tp_disabled_module in self.tp_disabled_modules):
+                tp_size = 1
+                tp_rank = 0
+            else:
+                tp_size = global_tp_size
+                tp_rank = global_tp_rank
+
             if any(target_module in mapped_weight_name
                    for target_module in self.target_modules
                    ) and mapped_weight_name.endswith(".weight"):
@@ -418,12 +430,16 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                     # Map vllm's names to transformers's names.
                     rep_name, sub_modules = modules_info
                     for sub_name in sub_modules:
-                        self.target_modules.append(
-                            name.replace(rep_name, sub_name))
+                        new_name = name.replace(rep_name, sub_name)
+                        self.target_modules.append(new_name)
+                        if module.disable_tp:
+                            self.tp_disabled_modules.append(new_name)
                 # Add original module name even if the module has stacked map,
                 # in case model has a mixture of disk-merged and disk-split
                 # weights with same last name.
                 self.target_modules.append(name)
+                if module.disable_tp:
+                    self.tp_disabled_modules.append(name)
             elif isinstance(module, FusedMoE) and hasattr(
                     module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 34b8d8e4ed6228df467827df3f62c084bc5b8a04..d1bdec21fd9745c67aa652ac374489644cdb908c 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -7,19 +7,20 @@ import time
 from collections.abc import Generator, Iterable
 from typing import Optional, cast
 
-import huggingface_hub
 import torch
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm import envs
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     fastsafetensors_weights_iterator, filter_duplicate_safetensors_files,
-    filter_files_not_needed_for_inference, get_lock, np_cache_weights_iterator,
+    filter_files_not_needed_for_inference, maybe_download_from_modelscope,
+    multi_thread_pt_weights_iterator,
+    multi_thread_safetensors_weights_iterator, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.platforms import current_platform
 
@@ -29,6 +30,9 @@ logger = init_logger(__name__)
 class DefaultModelLoader(BaseModelLoader):
     """Model loader that can load different file types from disk."""
 
+    # default number of thread when enable multithread weight loading
+    DEFAULT_NUM_THREADS = 8
+
     @dataclasses.dataclass
     class Source:
         """A source for weights."""
@@ -53,38 +57,15 @@ class DefaultModelLoader(BaseModelLoader):
 
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
-        if load_config.model_loader_extra_config:
-            raise ValueError(f"Model loader extra config is not supported for "
-                             f"load format {load_config.load_format}")
-
-    def _maybe_download_from_modelscope(
-            self, model: str, revision: Optional[str]) -> Optional[str]:
-        """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
-
-        Returns the path to the downloaded model, or None if the model is not
-        downloaded from ModelScope."""
-        if envs.VLLM_USE_MODELSCOPE:
-            # download model from ModelScope hub,
-            # lazy import so that modelscope is not required for normal use.
-            # pylint: disable=C.
-            from modelscope.hub.snapshot_download import snapshot_download
-
-            # Use file lock to prevent multiple processes from
-            # downloading the same model weights at the same time.
-            with get_lock(model, self.load_config.download_dir):
-                if not os.path.exists(model):
-                    model_path = snapshot_download(
-                        model_id=model,
-                        cache_dir=self.load_config.download_dir,
-                        local_files_only=huggingface_hub.constants.
-                        HF_HUB_OFFLINE,
-                        revision=revision,
-                        ignore_file_pattern=self.load_config.ignore_patterns,
-                    )
-                else:
-                    model_path = model
-            return model_path
-        return None
+
+        extra_config = load_config.model_loader_extra_config
+        allowed_keys = {"enable_multithread_load", "num_threads"}
+        unexpected_keys = set(extra_config.keys()) - allowed_keys
+
+        if unexpected_keys:
+            raise ValueError(f"Unexpected extra config keys for load format "
+                             f"{load_config.load_format}: "
+                             f"{unexpected_keys}")
 
     def _prepare_weights(
         self,
@@ -96,7 +77,7 @@ class DefaultModelLoader(BaseModelLoader):
         """Prepare weights for the model.
 
         If the model is not local, it will be downloaded."""
-        model_name_or_path = (self._maybe_download_from_modelscope(
+        model_name_or_path = (maybe_download_from_modelscope(
             model_name_or_path, revision) or model_name_or_path)
 
         is_local = os.path.isdir(model_name_or_path)
@@ -175,6 +156,7 @@ class DefaultModelLoader(BaseModelLoader):
             self, source: "Source"
     ) -> Generator[tuple[str, torch.Tensor], None, None]:
         """Get an iterator for the model weights based on the load format."""
+        extra_config = self.load_config.model_loader_extra_config
         hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
             source.model_or_path, source.revision, source.fall_back_to_pt,
             source.allow_patterns_overrides)
@@ -195,23 +177,42 @@ class DefaultModelLoader(BaseModelLoader):
                     self.load_config.use_tqdm_on_load,
                 )
             else:
-                weights_iterator = safetensors_weights_iterator(
+                if extra_config.get("enable_multithread_load"):
+                    weights_iterator = (
+                        multi_thread_safetensors_weights_iterator(
+                            hf_weights_files,
+                            self.load_config.use_tqdm_on_load,
+                            max_workers=extra_config.get(
+                                "num_threads", self.DEFAULT_NUM_THREADS),
+                        ))
+                else:
+                    weights_iterator = safetensors_weights_iterator(
+                        hf_weights_files,
+                        self.load_config.use_tqdm_on_load,
+                        self.load_config.safetensors_load_strategy,
+                    )
+        else:
+            if extra_config.get("enable_multithread_load"):
+                weights_iterator = multi_thread_pt_weights_iterator(
+                    hf_weights_files,
+                    self.load_config.use_tqdm_on_load,
+                    self.load_config.pt_load_map_location,
+                    max_workers=extra_config.get("num_threads",
+                                                 self.DEFAULT_NUM_THREADS),
+                )
+            else:
+                weights_iterator = pt_weights_iterator(
                     hf_weights_files,
                     self.load_config.use_tqdm_on_load,
+                    self.load_config.pt_load_map_location,
                 )
-        else:
-            weights_iterator = pt_weights_iterator(
-                hf_weights_files,
-                self.load_config.use_tqdm_on_load,
-                self.load_config.pt_load_map_location,
-            )
 
         if current_platform.is_tpu():
             from vllm.platforms.tpu import USE_TPU_COMMONS
 
             if not USE_TPU_COMMONS:
                 # In PyTorch XLA, we should call `xm.mark_step`
-                # requently so that not too many ops are accumulated
+                # frequently so that not too many ops are accumulated
                 # in the XLA program. import torch_xla.core.xla_model
                 # as xm
                 import torch_xla.core.xla_model as xm
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index f4a7da5744e04bfe1e67d215d739f347f27bbbd6..5b8c6268f64efaed4acaa564cc86b3f5b9cf286f 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch.nn as nn
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
     initialize_dummy_weights)
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 9877cb3b7c06e3caddaf5c7adb2eab49003f67a8..aaee8f3f7635387c83093999e48137ca1df4564c 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -9,7 +9,8 @@ import torch.nn as nn
 from huggingface_hub import hf_hub_download
 from transformers import AutoModelForCausalLM
 
-from vllm.config import LoadConfig, ModelConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.utils import (
     initialize_model, process_weights_after_loading, set_default_torch_dtype)
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py
deleted file mode 100644
index fad97aba84b6aaf9f69e0c5c968935e2b5315a45..0000000000000000000000000000000000000000
--- a/vllm/model_executor/model_loader/neuron.py
+++ /dev/null
@@ -1,476 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Utilities for selecting and loading Neuron models in transformers-neuronx
-framework."""
-import ast
-import copy
-import importlib
-import os
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from transformers import PretrainedConfig
-
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import get_quantization_config
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SequenceOutput)
-
-TORCH_DTYPE_TO_NEURON_AMP = {
-    "auto": "f32",
-    "half": "f16",
-    "float16": "f16",
-    "bfloat16": "bf16",
-    "float": "f32",
-    "float32": "f32",
-    torch.float16: "f16",
-    torch.bfloat16: "bf16",
-    torch.float32: "f32",
-}
-
-# Models supported by Neuron.
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = {
-    "LlamaForCausalLM": ("transformers_neuronx.llama.model",
-                         "LlamaForSampling", "LlamaForCausalLM"),
-    "MistralForCausalLM": ("transformers_neuronx.mistral.model",
-                           "MistralForSampling", "MistralForCausalLM")
-}
-
-
-class NeuronCausalLM(nn.Module):
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 on_device_sampling_disabled: bool = False) -> None:
-        super().__init__()
-        self.config = config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
-                                                logits_as_input=True)
-
-        self.on_device_sampling_disabled = on_device_sampling_disabled
-        if self.on_device_sampling_disabled:
-            # Use default sampler
-            self.sampler = Sampler()
-
-        # Lazy initialized
-        self.model: nn.Module
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        logits = self.model(input_ids,
-                            cache_ids=positions,
-                            start_ids=input_block_ids)
-        return logits
-
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(None, hidden_states, sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-
-        if self.on_device_sampling_disabled:
-            next_tokens = self.sampler(logits, sampling_metadata)
-            return next_tokens
-
-        # On-device sampling outputs the token ids directly.
-        sampled_token_ids = logits.flatten()
-        next_tokens = []
-        sample_idx = 0
-        for seq_group in sampling_metadata.seq_groups:
-            samples = []
-            for seq_id in seq_group.seq_ids:
-                token_id = sampled_token_ids[sample_idx].item()
-                samples.append(
-                    SequenceOutput(parent_seq_id=seq_id,
-                                   output_token=token_id,
-                                   logprobs={token_id: Logprob(token_id)}))
-                sample_idx += 1
-            next_tokens.append(
-                CompletionSequenceGroupOutput(samples=samples,
-                                              prompt_logprobs=None))
-
-        return SamplerOutput(outputs=next_tokens)
-
-    def load_weights(self, model_name_or_path: str, **kwargs):
-        arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
-            _NEURON_SUPPORTED_MODELS[arch])
-        neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
-
-        self.model = neuronx_model_cls.from_pretrained(model_name_or_path,
-                                                       **kwargs)
-        self.model.to_neuron()
-
-
-class NeuronSpeculationCausalLM(nn.Module):
-    """A Neuron-optimized causal language model with speculative decoding."""
-
-    SPECULATION_TERMINATION_ID = -1
-
-    def __init__(self, speculation_model) -> None:
-        super().__init__()
-        self.model = speculation_model
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        tokens, counts = self.model.speculative_iteration(
-            input_ids, positions, input_block_ids)
-
-        # Mark the end of accepted speculative tokens for each sequence with the
-        # speculation termination id.
-        batch_size, steps = tokens.shape
-        mask = torch.arange(steps).expand(batch_size, -1) >= counts
-        tokens[mask] = self.SPECULATION_TERMINATION_ID
-
-        return tokens
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[list[SamplerOutput]]:
-        batch_size, num_steps = logits.shape
-        seq_ids = [
-            seq_id for sg in sampling_metadata.seq_groups
-            for seq_id in sg.seq_ids
-        ]
-        # Organize input tensors by step instead of by sequence.
-        accepted_token_ids_by_step = logits.transpose(0, 1)
-        accepted_token_ids_by_step = accepted_token_ids_by_step.tolist()
-
-        sampler_output_list = []
-        for step_index in range(num_steps):
-            if all(token_id == self.SPECULATION_TERMINATION_ID
-                   for token_id in accepted_token_ids_by_step[step_index]):
-                break
-            step_output_token_ids = []
-            for sequence_index in range(batch_size):
-                token_id = accepted_token_ids_by_step[step_index][
-                    sequence_index]
-                step_output_token_ids.append(
-                    CompletionSequenceGroupOutput(samples=[
-                        SequenceOutput(parent_seq_id=seq_ids[sequence_index],
-                                       output_token=token_id,
-                                       logprobs={token_id: Logprob(token_id)})
-                    ],
-                                                  prompt_logprobs=None))
-            sampler_output_list.append(
-                SamplerOutput(outputs=step_output_token_ids))
-        return sampler_output_list
-
-
-def _get_model_architecture(config: PretrainedConfig) -> str:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        if arch in _NEURON_SUPPORTED_MODELS:
-            return arch
-    raise ValueError(
-        f"Model architectures {architectures} are not supported on Neuron "
-        f"for now. Supported architectures: "
-        f"{list(_NEURON_SUPPORTED_MODELS.keys())}")
-
-
-def _get_buckets(env: str, default_value: list[int]) -> list[int]:
-    env_value = os.getenv(env)
-    if env_value is None:
-        return default_value
-    buckets_remove_empty = filter(
-        lambda x: x is not None and len(x.strip()) > 0, env_value.split(","))
-    buckets_int = map(int, buckets_remove_empty)
-    buckets_list = list(buckets_int)
-    return buckets_list
-
-
-def _get_default_neuron_config(model_config: ModelConfig,
-                               parallel_config: ParallelConfig,
-                               scheduler_config: SchedulerConfig):
-    """Generate a neuron config based on vllm config args."""
-    from transformers_neuronx.config import ContinuousBatchingConfig
-    from transformers_neuronx.constants import LAYOUT_BSH
-
-    continuous_batching_config = ContinuousBatchingConfig(
-        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
-    quant_config = dict(
-        dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        quantize_method="vector_dynamic")
-    neuron_quantization_config_builder = lambda quant: get_quantization_config(
-        quant).from_config(quant_config).get_quant_method(None, "")
-    # TODO: Add Paged attention config to the default neuron arguments.
-    default_neuron_args = dict(
-        collectives_layout=LAYOUT_BSH,
-        attention_layout=LAYOUT_BSH,
-        fuse_qkv=True,
-        quant=neuron_quantization_config_builder(model_config.quantization)
-        if model_config.quantization else None,
-        continuous_batching=continuous_batching_config,
-        weight_tiling=bool(model_config.quantization),
-        on_device_generation=_get_neuron_on_device_generation_config(
-            model_config))
-    return default_neuron_args
-
-
-def _get_default_neuron_config_for_speculation(
-        model_config: ModelConfig, parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig):
-    """Generate a neuron config for speculative decoding based on
-    vllm config args."""
-    from transformers_neuronx.config import ContinuousBatchingConfig
-    from transformers_neuronx.constants import LAYOUT_BSH
-
-    continuous_batching_config = ContinuousBatchingConfig(
-        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
-
-    default_neuron_args = dict(collectives_layout=LAYOUT_BSH,
-                               attention_layout=LAYOUT_BSH,
-                               fuse_qkv=True,
-                               on_device_embedding=True,
-                               continuous_batching=continuous_batching_config,
-                               on_device_generation=copy.deepcopy(
-                                   model_config.neuron_sampling_params))
-    return default_neuron_args
-
-
-def _get_neuron_on_device_generation_config(model_config: ModelConfig):
-    if not _is_neuron_on_device_sampling_disabled(model_config):
-        return copy.deepcopy(model_config.neuron_sampling_params)
-    return None
-
-
-def _is_neuron_on_device_sampling_disabled(model_config: ModelConfig) -> bool:
-    return not getattr(model_config, "neuron_sampling_params", None)
-
-
-def _get_neuron_config_after_override(default_neuron_config,
-                                      overridden_neuron_config):
-    from transformers_neuronx.config import (ContinuousBatchingConfig,
-                                             GenerationConfig,
-                                             KVCacheQuantizationConfig,
-                                             NeuronConfig, QuantizationConfig,
-                                             SparseAttnConfig)
-
-    sparse_attn = overridden_neuron_config.pop("sparse_attn", {})
-    if sparse_attn:
-        overridden_neuron_config["sparse_attn"] = SparseAttnConfig(
-            **sparse_attn)
-
-    kv_cache_quant = overridden_neuron_config.pop("kv_cache_quant", {})
-    if kv_cache_quant:
-        overridden_neuron_config["kv_cache_quant"] = KVCacheQuantizationConfig(
-            **kv_cache_quant)
-
-    continuous_batching = overridden_neuron_config.pop("continuous_batching",
-                                                       {})
-    if continuous_batching:
-        overridden_neuron_config[
-            "continuous_batching"] = ContinuousBatchingConfig(
-                **continuous_batching)
-
-    quant = overridden_neuron_config.pop("quant", {})
-    if quant:
-        overridden_neuron_config["quant"] = QuantizationConfig(**quant)
-
-    on_device_generation = overridden_neuron_config.pop(
-        "on_device_generation", {})
-    if on_device_generation:
-        overridden_neuron_config["on_device_generation"] = GenerationConfig(
-            **on_device_generation)
-    default_neuron_config.update(overridden_neuron_config)
-    return NeuronConfig(**default_neuron_config)
-
-
-def get_neuron_model(model_config: ModelConfig,
-                     parallel_config: ParallelConfig,
-                     scheduler_config: SchedulerConfig) -> nn.Module:
-    """Initializes a neuron-optimized model for inference."""
-    # Create a model instance.
-    model = NeuronCausalLM(
-        model_config.hf_config,
-        _is_neuron_on_device_sampling_disabled(model_config))
-
-    default_neuron_config_args = _get_default_neuron_config(
-        model_config, parallel_config, scheduler_config)
-
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    model.load_weights(model_config.model,
-                       tp_degree=parallel_config.tensor_parallel_size,
-                       amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-                       neuron_config=neuron_config,
-                       context_length_estimate=context_length_estimates,
-                       n_positions=n_positions,
-                       batch_size=scheduler_config.max_num_seqs)
-
-    return model.eval()
-
-
-def get_neuron_speculation_model(model_config: ModelConfig,
-                                 parallel_config: ParallelConfig,
-                                 scheduler_config: SchedulerConfig,
-                                 speculation_config: SpeculativeConfig):
-    """Initializes a neuron-optimized speculation model for inference.
-
-    This method is only applicable for speculation with a standalone draft model
-    """
-    from transformers_neuronx.fused_speculation import FusedSpeculativeDecoder
-
-    # For Eagle SD, we need to pass in additional parameters in neuron config.
-    is_eagle = getattr(speculation_config.draft_model_config.hf_config,
-                       "is_eagle", False)
-
-    # Create target model instance.
-    target_model = NeuronCausalLM(model_config.hf_config)
-
-    default_neuron_config_args = _get_default_neuron_config_for_speculation(
-        model_config, parallel_config, scheduler_config)
-    if is_eagle:
-        default_neuron_config_args['is_eagle_target'] = True
-
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    target_model.load_weights(
-        model_config.model,
-        tp_degree=parallel_config.tensor_parallel_size,
-        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        neuron_config=neuron_config,
-        context_length_estimate=context_length_estimates,
-        n_positions=n_positions,
-        batch_size=scheduler_config.max_num_seqs)
-
-    target_model.eval()
-
-    # Create draft model instance.
-    draft_model = NeuronCausalLM(
-        speculation_config.draft_model_config.hf_config)
-
-    default_draft_neuron_config_args = (
-        _get_default_neuron_config_for_speculation(
-            speculation_config.draft_model_config, parallel_config,
-            scheduler_config))
-    if is_eagle:
-        default_draft_neuron_config_args['is_eagle_draft'] = True
-        default_draft_neuron_config_args['has_pre_attention_norm'] = False
-
-    draft_neuron_config = _get_neuron_config_after_override(
-        default_draft_neuron_config_args,
-        speculation_config.draft_model_config.override_neuron_config)
-
-    draft_model.load_weights(speculation_config.draft_model_config.model,
-                             tp_degree=speculation_config.
-                             draft_parallel_config.tensor_parallel_size,
-                             amp=TORCH_DTYPE_TO_NEURON_AMP[
-                                 speculation_config.draft_model_config.dtype],
-                             neuron_config=draft_neuron_config,
-                             context_length_estimate=context_length_estimates,
-                             n_positions=n_positions,
-                             batch_size=scheduler_config.max_num_seqs)
-
-    draft_model.eval()
-
-    num_speculative_tokens = speculation_config.num_speculative_tokens
-    # Create speculation model instance.
-    speculation_model = FusedSpeculativeDecoder(draft_model.model,
-                                                target_model.model,
-                                                num_speculative_tokens)
-    speculation_model.to_neuron()
-
-    return NeuronSpeculationCausalLM(speculation_model)
-
-
-def get_neuron_eagle_speculation_model(model_config: ModelConfig,
-                                       parallel_config: ParallelConfig,
-                                       scheduler_config: SchedulerConfig,
-                                       speculation_config: SpeculativeConfig):
-    """Initializes a neuron-optimized EAGLE speculation model for inference."""
-    from transformers_neuronx.eagle_speculation import EagleSpeculativeDecoder
-
-    # Create target model instance.
-    target_model = NeuronCausalLM(model_config.hf_config)
-
-    default_neuron_config_args = _get_default_neuron_config_for_speculation(
-        model_config, parallel_config, scheduler_config)
-    default_neuron_config_args['is_eagle_target'] = True
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    target_model.load_weights(
-        model_config.model,
-        tp_degree=parallel_config.tensor_parallel_size,
-        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        neuron_config=neuron_config,
-        context_length_estimate=context_length_estimates,
-        n_positions=n_positions,
-        batch_size=scheduler_config.max_num_seqs)
-
-    target_model.eval()
-
-    # Create draft model instance.
-    draft_model = NeuronCausalLM(
-        speculation_config.draft_model_config.hf_config)
-
-    default_draft_neuron_config_args = (
-        _get_default_neuron_config_for_speculation(
-            speculation_config.draft_model_config, parallel_config,
-            scheduler_config))
-    default_draft_neuron_config_args['is_eagle_draft'] = True
-    default_draft_neuron_config_args['has_pre_attention_norm'] = False
-    draft_neuron_config = _get_neuron_config_after_override(
-        default_draft_neuron_config_args,
-        speculation_config.draft_model_config.override_neuron_config)
-
-    draft_model.load_weights(speculation_config.draft_model_config.model,
-                             tp_degree=speculation_config.
-                             draft_parallel_config.tensor_parallel_size,
-                             amp=TORCH_DTYPE_TO_NEURON_AMP[
-                                 speculation_config.draft_model_config.dtype],
-                             neuron_config=draft_neuron_config,
-                             context_length_estimate=context_length_estimates,
-                             n_positions=n_positions,
-                             batch_size=scheduler_config.max_num_seqs)
-
-    draft_model.eval()
-
-    token_tree: dict[int, list[int]] = ast.literal_eval(
-        speculation_config.speculative_token_tree)
-
-    speculation_model = EagleSpeculativeDecoder(draft_model.model,
-                                                target_model.model,
-                                                token_tree=token_tree)
-    speculation_model.to_neuron()
-
-    return NeuronSpeculationCausalLM(speculation_model)
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
deleted file mode 100644
index f450961c64ff4931b2ba193fbc713b821ed9815d..0000000000000000000000000000000000000000
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ /dev/null
@@ -1,685 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Utilities for selecting and loading Neuron models in
-neuronx-distributed-inference framework."""
-# Disabling yapf because yapf and isort have conflicts for the below imports
-# yapf: disable
-import copy
-import hashlib
-import importlib
-import multiprocessing
-import os
-import shutil
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from neuronx_distributed_inference.models.config import (
-    FusedSpecNeuronConfig, OnDeviceSamplingConfig)
-from neuronx_distributed_inference.models.mllama.utils import (
-    create_vision_mask)
-from neuronx_distributed_inference.modules.lora_serving import (
-    LoraServingConfig)
-from neuronx_distributed_inference.utils.hf_adapter import (
-    load_pretrained_config)
-from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig
-
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig)
-from vllm.logger import init_logger
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SequenceOutput)
-
-# yapf: enable
-logger = init_logger(__name__)
-
-TORCH_DTYPE_TO_NEURON_AMP = {
-    "auto": "float32",
-    "half": "float16",
-    "float16": "float16",
-    "bfloat16": "bfloat16",
-    "float": "float32",
-    "float32": "float32",
-    torch.float16: "float16",
-    torch.bfloat16: "bfloat16",
-    torch.float32: "float32",
-}
-
-# Models supported by Neuronx distributed for inference.
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = {
-    "LlamaForCausalLM":
-    ("neuronx_distributed_inference.models.llama.modeling_llama",
-     "NeuronLlamaForCausalLM"),
-    "MistralForCausalLM":
-    ("neuronx_distributed_inference.models.llama.modeling_llama",
-     "NeuronLlamaForCausalLM"),
-    "DbrxForCausalLM":
-    ("neuronx_distributed_inference.models.dbrx.modeling_dbrx",
-     "NeuronDbrxForCausalLM"),
-    "MixtralForCausalLM":
-    ("neuronx_distributed_inference.models.mixtral.modeling_mixtral",
-     "NeuronMixtralForCausalLM"),
-    "MllamaForConditionalGeneration":
-    ("neuronx_distributed_inference.models.mllama.modeling_mllama",
-     "NeuronMllamaForCausalLM"),
-}
-
-
-class NeuronCausalLM(nn.Module):
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-    ) -> None:
-        super().__init__()
-        self.config = config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
-                                                logits_as_input=True)
-        self.sampler = Sampler()
-
-        # Lazy initialized
-        self.model: nn.Module
-
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                input_block_ids: torch.Tensor,
-                sampling_params: torch.Tensor,
-                prev_hidden: Optional[torch.Tensor] = None,
-                adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
-        # sort block ids sequentially for perf/neuron support reasons
-        sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
-        input_ids = torch.index_select(input_ids, 0, sorted_indices)
-        positions = torch.index_select(positions, 0, sorted_indices)
-        sampling_params = torch.index_select(sampling_params, 0,
-                                             sorted_indices)
-        output = self.model(input_ids,
-                            attention_mask=None,
-                            position_ids=positions,
-                            seq_ids=sorted_input_block_ids,
-                            sampling_params=sampling_params,
-                            prev_hidden=prev_hidden,
-                            adapter_ids=adapter_ids)
-        # on-device sampling
-        if self.config.neuron_config.on_device_sampling_config:
-            output = output.hidden_states
-        else:
-            output = output.logits[:, -1, :]
-
-        restored_indices = torch.argsort(sorted_indices)
-        if input_block_ids.shape[0] != 1:
-            output = torch.index_select(output, 0, restored_indices)
-
-        return output
-
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(None, hidden_states, sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        # on-device sampling
-        if self.config.neuron_config.on_device_sampling_config:
-            batch_size = logits.shape
-            seq_ids = [
-                seq_id for sg in sampling_metadata.seq_groups
-                for seq_id in sg.seq_ids
-            ]
-            assert len(seq_ids) == list(batch_size)[0], "batch size mismatch"
-            # Organize input tensors by step instead of by sequence.
-            accepted_token_ids_by_step = logits.flatten()
-            accepted_token_ids_by_step = accepted_token_ids_by_step.tolist()
-
-            step_output_token_ids = []
-            for i, seq_id in enumerate(seq_ids):
-                token_id = accepted_token_ids_by_step[i]
-                step_output_token_ids.append(
-                    CompletionSequenceGroupOutput(samples=[
-                        SequenceOutput(parent_seq_id=seq_id,
-                                       output_token=token_id,
-                                       logprobs={token_id: Logprob(token_id)})
-                    ],
-                                                  prompt_logprobs=None))
-            return SamplerOutput(outputs=step_output_token_ids)
-        else:
-            return self.sampler(logits, sampling_metadata)
-
-    def load_weights(self, model_name_or_path: str, **kwargs):
-        arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls_name = (
-            _NEURON_SUPPORTED_MODELS[arch])
-        neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
-        neuron_config = neuronx_model_cls.get_neuron_config_cls()(
-            **kwargs['neuron_config'])
-        self.config.neuron_config = neuron_config
-        config = neuronx_model_cls.get_config_cls()(
-            neuron_config,
-            load_config=load_pretrained_config(model_name_or_path))
-        hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'),
-                                    usedforsecurity=False).hexdigest()
-        if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None:
-            compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS")
-        elif os.path.exists(model_name_or_path):
-            compiled_model_path = os.path.join(model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-            shutil.rmtree(compiled_model_path, ignore_errors=True)
-        else:
-            compiled_model_path = os.path.join("local-models",
-                                               model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-            shutil.rmtree(compiled_model_path, ignore_errors=True)
-        try:
-            self.model = neuronx_model_cls(compiled_model_path)
-            override_neuron_config = kwargs["override_neuron_config"]
-            for k, v in override_neuron_config.items():
-                setattr(self.model.config.neuron_config, k, v)
-            self.model.load(compiled_model_path)
-            return
-        except (FileNotFoundError, ValueError) as e:
-            logger.warning("Exception: %s", e)
-            logger.warning("Failed to load the model from %s, Recompiling...",
-                           compiled_model_path)
-        if not os.path.exists(model_name_or_path):
-            hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
-            saved_path = os.path.join("local-models", model_name_or_path)
-            hf_model.save_pretrained(saved_path)
-            model_name_or_path = saved_path
-        self.model = neuronx_model_cls(model_name_or_path, config)
-        self.model.compile(compiled_model_path)
-        self.model.load(compiled_model_path)
-
-
-class NeuronMllamaForCausalLM(nn.Module):
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 on_device_sampling_disabled: bool = False) -> None:
-        super().__init__()
-        # has_image is the only multimodal input that is used in
-        # token-generation
-        # This is a cache (on CPU) that saves has_image data per sequence id
-        # The number of entries in this cache is <= Batch-Size
-        self.has_image_cache: dict[int, torch.Tensor] = {}
-        self.config = config
-        self.logits_processor = LogitsProcessor(
-            config.get_text_config().vocab_size, logits_as_input=True)
-
-        self.on_device_sampling_disabled = on_device_sampling_disabled
-        if self.on_device_sampling_disabled:
-            # Use default sampler
-            self.sampler = Sampler()
-
-        # Lazy initialized
-        self.model: nn.Module
-        self.is_reorder_needed: bool = True
-
-    def read_from_has_image_cache(self, seq_ids: torch.Tensor):
-        has_image_list = []
-        for index in range(len(seq_ids)):
-            seq_id = seq_ids[index].item()
-            if seq_id in self.has_image_cache:
-                has_image_list.append(self.has_image_cache[seq_id])
-            else:
-                has_image_list.append(torch.tensor([0]))
-        return torch.tensor(has_image_list)
-
-    def write_to_has_image_cache(self, seq_ids: torch.Tensor,
-                                 has_image: torch.Tensor):
-        for index in range(len(seq_ids)):
-            seq_id = seq_ids[index].item()
-            if index < len(has_image):
-                self.has_image_cache[seq_id] = has_image[index]
-            else:
-                self.has_image_cache[seq_id] = torch.zeros(1)
-
-    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
-                seq_ids: torch.Tensor, pixel_values: torch.Tensor,
-                aspect_ratios: torch.Tensor, num_chunks: torch.Tensor,
-                has_image: torch.Tensor, sampling_params) -> torch.Tensor:
-
-        # We update the has_image cache during prefill
-        # and read the has_image cache during decode
-        if input_ids.shape[-1] > 1:  # prefill
-            self.write_to_has_image_cache(seq_ids, has_image)
-        else:
-            has_image = self.read_from_has_image_cache(seq_ids)
-            bs = input_ids.shape[0]
-            num_chunks = torch.zeros((bs, 1))
-            aspect_ratios = torch.zeros((bs, 1, 2))
-
-        input_block_ids = seq_ids
-        origin_input_block_ids = seq_ids
-        if self.is_reorder_needed:
-            # sort block ids sequentially for perf/neuron support reasons
-            input_block_ids, sorted_indices = torch.sort(input_block_ids)
-            input_ids = torch.index_select(input_ids, 0, sorted_indices)
-            positions = torch.index_select(positions, 0, sorted_indices)
-            sampling_params = torch.index_select(sampling_params, 0,
-                                                 sorted_indices)
-            pixel_values = torch.index_select(pixel_values, 0, sorted_indices)
-            aspect_ratios = torch.index_select(aspect_ratios, 0,
-                                               sorted_indices)
-            num_chunks = torch.index_select(num_chunks, 0, sorted_indices)
-            has_image = torch.index_select(has_image, 0, sorted_indices)
-
-        self.vision_mask = create_vision_mask(input_ids, self.vision_token_id)
-        output = self.model(
-            input_ids.to(torch.int32),
-            attention_mask=None,
-            position_ids=positions.to(torch.int32),
-            seq_ids=seq_ids.flatten().to(torch.int32),
-            pixel_values=pixel_values.to(
-                self.config.vision_config.torch_dtype),
-            aspect_ratios=aspect_ratios.to(torch.int32),
-            vision_mask=self.vision_mask.to(torch.int32),
-            sampling_params=sampling_params,
-            num_chunks=num_chunks.to(torch.int32),
-            has_image=has_image.to(torch.int32),
-        )
-        if self.config.neuron_config.on_device_sampling_config:
-            output = output.hidden_states
-        else:
-            output = output.logits[:, -1, :]
-
-        if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1:
-            restored_indices = torch.argsort(sorted_indices)
-            output = torch.index_select(output, 0, restored_indices)
-        return output
-
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(None, hidden_states, sampling_metadata)
-        return logits
-
-    def sample(self, hidden_states, sampling_metadata):
-        if not self.on_device_sampling_disabled:
-            with torch.profiler.record_function("sample"):
-                hidden_states = hidden_states.flatten()
-                res = []
-                sample_idx = 0
-                for seq_group in sampling_metadata.seq_groups:
-                    seq_ids = seq_group.seq_ids
-                    samples = []
-                    for seq_id in seq_ids:
-                        token_id = hidden_states[sample_idx].item()
-                        samples.append(
-                            SequenceOutput(
-                                parent_seq_id=seq_id,
-                                output_token=token_id,
-                                logprobs={token_id: Logprob(token_id)}))
-                        sample_idx += 1
-                    res.append(
-                        CompletionSequenceGroupOutput(samples=samples,
-                                                      prompt_logprobs=None))
-                next_tokens = SamplerOutput(outputs=res)
-        else:
-            next_tokens = self.sampler(None, hidden_states, sampling_metadata)
-        return next_tokens
-
-    def load_weights(self, model_name_or_path: str, **kwargs):
-        arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls_name = (
-            _NEURON_SUPPORTED_MODELS[arch])
-        neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
-        neuron_config = neuronx_model_cls.get_neuron_config_cls()(
-            **kwargs['neuron_config'])
-        self.config.neuron_config = neuron_config
-        logger.info("neuron_config buckets: %s",
-                    self.config.neuron_config.buckets)
-        config = neuronx_model_cls.get_config_cls()(
-            neuron_config,
-            load_config=load_pretrained_config(model_name_or_path))
-        hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'),
-                                    usedforsecurity=False).hexdigest()
-        if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None:
-            compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS")
-        elif os.path.exists(model_name_or_path):
-            compiled_model_path = os.path.join(model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-        else:
-            compiled_model_path = os.path.join("local-models",
-                                               model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-        try:
-            self.model = neuronx_model_cls(compiled_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-            self.vision_token_id = tokenizer(
-                "<|image|>", add_special_tokens=False).input_ids[0]
-            self.model.load(compiled_model_path)
-            return
-        except (FileNotFoundError, ValueError):
-            logger.warning("Failed to load the model from %s, Recompiling...",
-                           compiled_model_path)
-        if not os.path.exists(model_name_or_path):
-            hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
-            saved_path = os.path.join("local-models", model_name_or_path)
-            hf_model.save_pretrained(saved_path)
-            model_name_or_path = saved_path
-        self.model = neuronx_model_cls(model_name_or_path, config)
-
-        logger.info("\nCompiling and saving model to %s", model_name_or_path)
-
-        p = multiprocessing.Process(target=compile_model,
-                                    args=(self, compiled_model_path))
-        p.start()
-        p.join()
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-        tokenizer.save_pretrained(compiled_model_path)
-        logger.info("Successfully compiled and saved the model in %s",
-                    compiled_model_path)
-
-        # Read "<|image|>" token_id from the tokenizer
-        self.vision_token_id = tokenizer("<|image|>",
-                                         add_special_tokens=False).input_ids[0]
-        logger.info("\nLoading model from compiled checkpoint...")
-        self.model.load(compiled_model_path)
-
-
-def compile_model(neuron_model, traced_model_path):
-    neuron_model.model.compile(traced_model_path)
-
-
-class NeuronSpeculationCausalLM(nn.Module):
-    """A Neuron-optimized causal language model with speculative decoding."""
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-    ) -> None:
-        super().__init__()
-        self.config = config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
-                                                logits_as_input=True)
-        # Lazy initialized
-        self.model: nn.Module
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-        sampling_params: torch.Tensor,
-    ) -> torch.Tensor:
-        # sort block ids sequentially for perf/neuron support reasons
-        sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
-        input_ids = torch.index_select(input_ids, 0, sorted_indices)
-        positions = torch.index_select(positions, 0, sorted_indices)
-        sampling_params = torch.index_select(sampling_params, 0,
-                                             sorted_indices)
-
-        output = self.model(input_ids,
-                            attention_mask=None,
-                            position_ids=positions,
-                            seq_ids=sorted_input_block_ids,
-                            sampling_params=sampling_params)
-        restored_indices = torch.argsort(sorted_indices)
-
-        # CTX encoding
-        if (positions[:, 0]).sum().item() == 0:
-            output = output.fused_outputs[0][:, 0:1]
-            if input_block_ids.shape[0] != 1:
-                output = torch.index_select(output, 0, restored_indices)
-            return output
-
-        # Fused Spec (Generation)
-        accepted_tokens_with_padding = output.fused_outputs[0]
-        next_pos_ids = output.fused_outputs[-1]
-        generated_token_counts = next_pos_ids - positions
-
-        assert torch.any(generated_token_counts == 0).item() is False, \
-            "NxDI model generated no output for one or more sequences."
-
-        batch_size, steps = accepted_tokens_with_padding.shape
-        mask = torch.arange(steps).expand(batch_size,
-                                          -1) >= generated_token_counts
-        accepted_tokens_with_padding[mask] = -1
-
-        if input_block_ids.shape[0] != 1:
-            accepted_tokens_with_padding = torch.index_select(
-                accepted_tokens_with_padding, 0, restored_indices)
-
-        return accepted_tokens_with_padding
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[list[SamplerOutput]]:
-        batch_size, num_steps = logits.shape
-        seq_ids = [
-            seq_id for sg in sampling_metadata.seq_groups
-            for seq_id in sg.seq_ids
-        ]
-        # Organize input tensors by step instead of by sequence.
-        accepted_token_ids_by_step = logits.transpose(0, 1)
-        accepted_token_ids_by_step = accepted_token_ids_by_step.tolist()
-
-        sampler_output_list = []
-        for step_index in range(num_steps):
-            if all(token_id == -1
-                   for token_id in accepted_token_ids_by_step[step_index]):
-                break
-            step_output_token_ids = []
-            for sequence_index in range(batch_size):
-                token_id = accepted_token_ids_by_step[step_index][
-                    sequence_index]
-                step_output_token_ids.append(
-                    CompletionSequenceGroupOutput(samples=[
-                        SequenceOutput(parent_seq_id=seq_ids[sequence_index],
-                                       output_token=token_id,
-                                       logprobs={token_id: Logprob(token_id)})
-                    ],
-                                                  prompt_logprobs=None))
-            sampler_output_list.append(
-                SamplerOutput(outputs=step_output_token_ids))
-        return sampler_output_list
-
-    def load_weights(self, model_name_or_path: str,
-                     draft_model_name_or_path: str, **kwargs):
-        arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls_name = (
-            _NEURON_SUPPORTED_MODELS[arch])
-        neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
-        neuron_config = neuronx_model_cls.get_neuron_config_cls()(
-            **kwargs['neuron_config'])
-        config = neuronx_model_cls.get_config_cls()(
-            neuron_config,
-            load_config=load_pretrained_config(model_name_or_path))
-
-        draft_neuron_config = copy.deepcopy(config.neuron_config)
-        if not config.neuron_config.enable_eagle_speculation:
-            draft_neuron_config.speculation_length = 0
-        draft_neuron_config.trace_tokengen_model = True
-        draft_neuron_config.enable_fused_speculation = False
-        if getattr(config.neuron_config, "draft_model_modules_to_not_convert",
-                   None):
-            draft_neuron_config.modules_to_not_convert = (
-                draft_neuron_config.draft_model_modules_to_not_convert)
-        if config.neuron_config.enable_eagle_speculation:
-            draft_neuron_config.is_eagle_draft = True
-            draft_neuron_config.sequence_parallel_enabled = False
-        draft_config = neuronx_model_cls.get_config_cls()(
-            draft_neuron_config,
-            load_config=load_pretrained_config(draft_model_name_or_path))
-        fused_spec_config = (FusedSpecNeuronConfig(
-            neuronx_model_cls._model_cls,
-            draft_config=draft_config,
-            draft_model_path=draft_model_name_or_path))
-        config.fused_spec_config = fused_spec_config
-        self.config.neuron_config = neuron_config
-
-        hashed_config = hashlib.md5(config.to_json_string().encode('utf-8'),
-                                    usedforsecurity=False).hexdigest()
-        if os.getenv("NEURON_COMPILED_ARTIFACTS") is not None:
-            compiled_model_path = os.getenv("NEURON_COMPILED_ARTIFACTS")
-        elif os.path.exists(model_name_or_path):
-            compiled_model_path = os.path.join(model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-            shutil.rmtree(compiled_model_path, ignore_errors=True)
-        else:
-            compiled_model_path = os.path.join("local-models",
-                                               model_name_or_path,
-                                               "neuron-compiled-artifacts",
-                                               hashed_config)
-            shutil.rmtree(compiled_model_path, ignore_errors=True)
-        try:
-            self.model = neuronx_model_cls(compiled_model_path)
-            override_neuron_config = kwargs["override_neuron_config"]
-            for k, v in override_neuron_config.items():
-                setattr(self.model.config.neuron_config, k, v)
-            self.model.load(compiled_model_path)
-            return
-        except (FileNotFoundError, ValueError) as e:
-            logger.warning("Exception: %s", e)
-            logger.warning("Failed to load the model from %s Recompiling...",
-                           compiled_model_path)
-        if not os.path.exists(model_name_or_path):
-            hf_model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
-            saved_path = os.path.join("local-models", model_name_or_path)
-            hf_model.save_pretrained(saved_path)
-            model_name_or_path = saved_path
-        if not os.path.exists(draft_model_name_or_path):
-            if draft_model_name_or_path != model_name_or_path:
-                hf_model = AutoModelForCausalLM.from_pretrained(
-                    draft_model_name_or_path)
-                saved_path = os.path.join("local-models",
-                                          draft_model_name_or_path)
-                hf_model.save_pretrained(saved_path)
-                draft_model_name_or_path = saved_path
-            else:
-                draft_model_name_or_path = model_name_or_path
-            config.fused_spec_config.draft_model_path = draft_model_name_or_path
-        self.model = neuronx_model_cls(model_name_or_path, config)
-        self.model.compile(compiled_model_path)
-        self.model.load(compiled_model_path)
-
-
-def _get_model_architecture(config: PretrainedConfig) -> str:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        if arch in _NEURON_SUPPORTED_MODELS:
-            return arch
-    raise ValueError(
-        f"Model architectures {architectures} are not supported on Neuron "
-        f"for now. Supported architectures: "
-        f"{list(_NEURON_SUPPORTED_MODELS.keys())}")
-
-
-def _get_default_neuron_config(model_config: ModelConfig,
-                               parallel_config: ParallelConfig,
-                               scheduler_config: SchedulerConfig,
-                               lora_serving_config: LoraServingConfig):
-    """Generate a neuron config based on vllm config args."""
-    on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True,
-                                                       deterministic=False)
-    batch_size = scheduler_config.max_num_seqs
-
-    neuron_config = dict(
-        tp_degree=parallel_config.tensor_parallel_size,
-        ctx_batch_size=1,
-        batch_size=batch_size,
-        max_context_length=scheduler_config.max_model_len,
-        seq_len=scheduler_config.max_model_len,
-        enable_bucketing=True,
-        is_continuous_batching=True,
-        quantized=False,
-        torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        padding_side="right",
-        on_device_sampling_config=on_device_sampling_config,
-        sequence_parallel_enabled=True,
-        lora_serving_config=lora_serving_config)
-    return neuron_config
-
-
-def _get_default_speculation_config(model_config: ModelConfig,
-                                    parallel_config: ParallelConfig,
-                                    scheduler_config: SchedulerConfig,
-                                    speculation_config: SpeculativeConfig):
-    """Generate a neuron config for speculative decoding based on vllm config
-    args."""
-    neuron_config = dict(
-        tp_degree=parallel_config.tensor_parallel_size,
-        ctx_batch_size=1,
-        batch_size=scheduler_config.max_num_seqs,
-        max_context_length=scheduler_config.max_model_len,
-        seq_len=scheduler_config.max_model_len,
-        speculation_length=speculation_config.num_speculative_tokens,
-        trace_tokengen_model=False,
-        enable_fused_speculation=True,
-        enable_bucketing=True,
-        is_continuous_batching=True,
-        quantized=False,
-        torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        on_device_sampling_config=dict(
-            top_k=1,
-            do_sample=False,
-        ))
-    return neuron_config
-
-
-def _get_neuron_config_after_override(default_neuron_config,
-                                      overridden_neuron_config):
-    """Update default neuron config values with override args"""
-    overridden_neuron_config = overridden_neuron_config or {}
-    default_neuron_config.update(overridden_neuron_config)
-    return default_neuron_config
-
-
-def get_neuron_model(model_config: ModelConfig,
-                     parallel_config: ParallelConfig,
-                     scheduler_config: SchedulerConfig,
-                     lora_serving_config: LoraServingConfig) -> nn.Module:
-    """Initializes a neuron-optimized model for inference."""
-    model_arch = _get_model_architecture(model_config.hf_config)
-    if model_arch == "MllamaForConditionalGeneration":
-        model = NeuronMllamaForCausalLM(model_config.hf_config)
-    else:
-        model = NeuronCausalLM(model_config.hf_config)
-    default_neuron_config_args = _get_default_neuron_config(
-        model_config, parallel_config, scheduler_config, lora_serving_config)
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    override_neuron_config = model_config.override_neuron_config
-    model.load_weights(model_config.model,
-                       neuron_config=neuron_config,
-                       override_neuron_config=override_neuron_config)
-    return model.eval()
-
-
-def get_neuron_speculation_model(model_config: ModelConfig,
-                                 parallel_config: ParallelConfig,
-                                 scheduler_config: SchedulerConfig,
-                                 speculation_config: SpeculativeConfig):
-    """Initializes a neuron-optimized speculation model for inference.
-    
-    This model handles speculation using both a draft model and an EAGLE draft. 
-    """
-    model = NeuronSpeculationCausalLM(model_config.hf_config)
-    default_neuron_config_args = _get_default_speculation_config(
-        model_config, parallel_config, scheduler_config, speculation_config)
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    override_neuron_config = model_config.override_neuron_config
-    model.load_weights(model_config.model,
-                       speculation_config.draft_model_config.model,
-                       neuron_config=neuron_config,
-                       override_neuron_config=override_neuron_config)
-    return model.eval()
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index 83e0f386c10823d09e0878feed69557bb8004423..dc941401a04e0e4e1bab609fb7ed64de7c42d1c8 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa: SIM117
-import glob
 import os
 from collections.abc import Generator
 from typing import Optional
@@ -10,13 +9,14 @@ import torch
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
     download_safetensors_index_file_from_hf, download_weights_from_hf,
     runai_safetensors_weights_iterator)
-from vllm.transformers_utils.s3_utils import glob as s3_glob
-from vllm.transformers_utils.utils import is_s3
+from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
+                                                 list_safetensors)
 
 
 class RunaiModelStreamerLoader(BaseModelLoader):
@@ -53,27 +53,22 @@ class RunaiModelStreamerLoader(BaseModelLoader):
 
         If the model is not local, it will be downloaded."""
 
-        is_s3_path = is_s3(model_name_or_path)
+        is_object_storage_path = is_runai_obj_uri(model_name_or_path)
         is_local = os.path.isdir(model_name_or_path)
         safetensors_pattern = "*.safetensors"
         index_file = SAFE_WEIGHTS_INDEX_NAME
 
-        hf_folder = (model_name_or_path if
-                     (is_local or is_s3_path) else download_weights_from_hf(
+        hf_folder = (model_name_or_path if (is_local or is_object_storage_path)
+                     else download_weights_from_hf(
                          model_name_or_path,
                          self.load_config.download_dir,
                          [safetensors_pattern],
                          revision,
                          ignore_patterns=self.load_config.ignore_patterns,
                      ))
-        if is_s3_path:
-            hf_weights_files = s3_glob(path=hf_folder,
-                                       allow_pattern=[safetensors_pattern])
-        else:
-            hf_weights_files = glob.glob(
-                os.path.join(hf_folder, safetensors_pattern))
-
-        if not is_local and not is_s3_path:
+        hf_weights_files = list_safetensors(path=hf_folder)
+
+        if not is_local and not is_object_storage_path:
             download_safetensors_index_file_from_hf(
                 model_name_or_path, index_file, self.load_config.download_dir,
                 revision)
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 3edd4ec4007e85a55ea87949bc1963b9bf95bde1..a85ca065d1d274870ea56af7b3e24da23ef62c32 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -10,7 +10,8 @@ from typing import Any, Optional
 import torch
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.weight_utils import (
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index fa01758ab4cee2c5edce7fc1ff793020b310454f..65ea49c6429445869d8bb8e105aaf60bf8ee1e75 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -8,7 +8,8 @@ from typing import Union
 import torch
 from torch import nn
 
-from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig
+from vllm.config import ModelConfig, ParallelConfig, VllmConfig
+from vllm.config.load import LoadConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.tensorizer import (
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index f57ebdb1abcbcd8c7f479281dbba7cbfc497da68..c82fa5a40aa53c6c2571ff7e52a4fc772c46e365 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -169,22 +169,6 @@ def get_model_architecture(
         model_config: ModelConfig) -> tuple[type[nn.Module], str]:
     architectures = getattr(model_config.hf_config, "architectures", [])
 
-    # Special handling for quantized Mixtral.
-    # FIXME(woosuk): This is a temporary hack.
-    mixtral_supported = [
-        "fp8",
-        "compressed-tensors",
-        "gptq_marlin",
-        "awq_marlin",
-        "quark",
-        "bitsandbytes",
-    ]
-
-    if (model_config.quantization is not None
-            and model_config.quantization not in mixtral_supported
-            and "MixtralForCausalLM" in architectures):
-        architectures = ["QuantMixtralForCausalLM"]
-
     model_cls, arch = model_config.registry.resolve_model_cls(
         architectures,
         model_config=model_config,
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f87eeaa4563ff7f24acc9da83fe2eeed94dc5684..f2c66763d0816bf68e4c5eb1770f2f414a297f68 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utilities for downloading and initializing model weights."""
+import concurrent.futures
 import fnmatch
 import glob
 import hashlib
@@ -18,10 +19,12 @@ import huggingface_hub.constants
 import numpy as np
 import torch
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
-from safetensors.torch import load_file, safe_open, save_file
+from safetensors.torch import load, load_file, safe_open, save_file
 from tqdm.auto import tqdm
 
-from vllm.config import LoadConfig, ModelConfig
+from vllm import envs
+from vllm.config import ModelConfig
+from vllm.config.load import LoadConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QuantizationConfig,
@@ -95,6 +98,41 @@ def get_lock(model_name_or_path: Union[str, Path],
     return lock
 
 
+def maybe_download_from_modelscope(
+        model: str,
+        revision: Optional[str] = None,
+        download_dir: Optional[str] = None,
+        ignore_patterns: Optional[Union[str, list[str]]] = None,
+        allow_patterns: Optional[Union[list[str],
+                                       str]] = None) -> Optional[str]:
+    """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
+
+        Returns the path to the downloaded model, or None if the model is not
+        downloaded from ModelScope."""
+    if envs.VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        # pylint: disable=C.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # Use file lock to prevent multiple processes from
+        # downloading the same model weights at the same time.
+        with get_lock(model, download_dir):
+            if not os.path.exists(model):
+                model_path = snapshot_download(
+                    model_id=model,
+                    cache_dir=download_dir,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    revision=revision,
+                    ignore_file_pattern=ignore_patterns,
+                    allow_patterns=allow_patterns,
+                )
+            else:
+                model_path = model
+        return model_path
+    return None
+
+
 def _shared_pointers(tensors):
     ptrs = defaultdict(list)
     for k, v in tensors.items():
@@ -169,7 +207,13 @@ def get_quant_config(model_config: ModelConfig,
     # Inflight BNB quantization
     if model_config.quantization == "bitsandbytes":
         return quant_cls.from_config({})
-    is_local = os.path.isdir(model_config.model)
+    model_name_or_path = maybe_download_from_modelscope(
+        model_config.model,
+        revision=model_config.revision,
+        download_dir=load_config.download_dir,
+        allow_patterns=["*.json"],
+    ) or model_config.model
+    is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
         with get_lock(model_config.model, load_config.download_dir):
@@ -182,7 +226,7 @@ def get_quant_config(model_config: ModelConfig,
                 tqdm_class=DisabledTqdm,
             )
     else:
-        hf_folder = model_config.model
+        hf_folder = model_name_or_path
 
     possible_config_filenames = quant_cls.get_config_filenames()
 
@@ -475,18 +519,58 @@ def np_cache_weights_iterator(
 def safetensors_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
+    safetensors_load_strategy: str = "lazy",
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model safetensor files."""
+    loading_desc = "Loading safetensors checkpoint shards"
+    if safetensors_load_strategy == "eager":
+        loading_desc += " (eager)"
+
     for st_file in tqdm(
             hf_weights_files,
-            desc="Loading safetensors checkpoint shards",
+            desc=loading_desc,
             disable=not enable_tqdm(use_tqdm_on_load),
             bar_format=_BAR_FORMAT,
     ):
-        with safe_open(st_file, framework="pt") as f:
-            for name in f.keys():  # noqa: SIM118
-                param = f.get_tensor(name)
-                yield name, param
+        if safetensors_load_strategy == "eager":
+            with open(st_file, "rb") as f:
+                state_dict = load(f.read())
+            yield from state_dict.items()
+        else:
+            with safe_open(st_file, framework="pt") as f:
+                for name in f.keys():  # noqa: SIM118
+                    param = f.get_tensor(name)
+                    yield name, param
+
+
+def multi_thread_safetensors_weights_iterator(
+    hf_weights_files: list[str],
+    use_tqdm_on_load: bool,
+    max_workers: int = 4,
+) -> Generator[tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model safetensor files."""
+
+    def _load_file(st_file: str):
+        result = load_file(st_file, device="cpu")
+        return result
+
+    with concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(_load_file, st_file)
+            for st_file in hf_weights_files
+        ]
+        futures_iter = tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(hf_weights_files),
+            desc="Multi-thread loading shards",
+            disable=not enable_tqdm(use_tqdm_on_load),
+            bar_format=_BAR_FORMAT,
+        )
+
+        for future in futures_iter:
+            state_dict = future.result()
+            yield from state_dict.items()
 
 
 def runai_safetensors_weights_iterator(
@@ -569,6 +653,39 @@ def pt_weights_iterator(
         del state
 
 
+def multi_thread_pt_weights_iterator(
+    hf_weights_files: list[str],
+    use_tqdm_on_load: bool,
+    pt_load_map_location: Union[str, dict[str, str]] = "cpu",
+    max_workers: int = 4,
+) -> Generator[tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model bin/pt files."""
+
+    def _load_file(bin_file: str):
+        return torch.load(bin_file,
+                          map_location=pt_load_map_location,
+                          weights_only=True)
+
+    with concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(_load_file, bin_file)
+            for bin_file in hf_weights_files
+        ]
+        futures_iter = tqdm(
+            concurrent.futures.as_completed(futures),
+            total=len(hf_weights_files),
+            desc="Multi-thread loading pt checkpoint shards",
+            disable=not enable_tqdm(use_tqdm_on_load),
+            bar_format=_BAR_FORMAT,
+        )
+
+        for future in futures_iter:
+            state = future.result()
+            yield from state.items()
+            del state
+
+
 def get_gguf_extra_tensor_names(
         gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]:
     reader = gguf.GGUFReader(gguf_file)
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 50c2cd97f3d0993bd54daecd95ae7635f5f9653b..78ad9a433e314a725b6a1158d7700a3bfeab94a6 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -49,26 +49,28 @@ def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]:
         if not dense_modules:
             return None
 
-        module = dense_modules[0]
-        folder = module.get("path", "")
+        layers = []
+        for module in dense_modules:
+            folder = module.get("path", "")
+
+            config_path = f"{folder}/config.json" if folder else "config.json"
+            layer_config = get_hf_file_to_dict(config_path, model_config.model,
+                                               model_config.revision)
+            if not layer_config:
+                continue
 
-        config_path = f"{folder}/config.json" if folder else "config.json"
-        layer_config = get_hf_file_to_dict(config_path, model_config.model,
-                                           model_config.revision)
-        if not layer_config:
-            return None
+            linear = nn.Linear(layer_config.get("in_features", 768),
+                               layer_config.get("out_features", 768),
+                               bias=layer_config.get("bias", True),
+                               dtype=model_config.head_dtype)
 
-        linear = nn.Linear(layer_config.get("in_features", 768),
-                           layer_config.get("out_features", 768),
-                           bias=layer_config.get("bias", True),
-                           dtype=torch.float32)
+            if not _load_dense_weights(linear, folder, model_config):
+                continue
 
-        if _load_dense_weights(linear, folder, model_config):
-            layers = [linear]
+            layers.append(linear)
             if act_name := layer_config.get("activation_function"):
                 layers.append(get_act_fn(act_name))
-            return nn.Sequential(*layers).to(dtype=torch.float32)
-
+        return nn.Sequential(*layers).to(dtype=model_config.head_dtype)
     except Exception:
         logger.exception("ST projector loading failed")
 
@@ -103,15 +105,13 @@ def _load_dense_weights(linear: nn.Linear, folder: str,
                 if weight_key in state_dict:
                     weight_loader = getattr(linear.weight, "weight_loader",
                                             default_weight_loader)
-                    weight_loader(linear.weight,
-                                  state_dict[weight_key].to(torch.float32))
+                    weight_loader(linear.weight, state_dict[weight_key])
 
                     bias_key = weight_key.replace("weight", "bias")
                     if linear.bias is not None and bias_key in state_dict:
                         bias_loader = getattr(linear.bias, "weight_loader",
                                               default_weight_loader)
-                        bias_loader(linear.bias,
-                                    state_dict[bias_key].to(torch.float32))
+                        bias_loader(linear.bias, state_dict[bias_key])
                     return True
         except Exception:
             logger.exception("Failed to load %s", filename)
@@ -255,7 +255,7 @@ def as_seq_cls_model(cls: _T) -> _T:
     from vllm.model_executor.models.interfaces import SupportsCrossEncoding
     from vllm.sequence import IntermediateTensors
 
-    from .utils import maybe_prefix
+    from .utils import get_model_hidden_size, maybe_prefix
 
     class ModelForSequenceClassification(_create_pooling_model_cls(cls),
                                          SupportsCrossEncoding):
@@ -263,9 +263,10 @@ def as_seq_cls_model(cls: _T) -> _T:
         def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""):
             config = vllm_config.model_config.hf_config
             quant_config = vllm_config.quant_config
+            hidden_size = get_model_hidden_size(config)
 
             self.score = ReplicatedLinear(
-                config.hidden_size,
+                hidden_size,
                 config.num_labels,
                 bias=False,
                 params_dtype=torch.float32,
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 0de683d2cd060b94d34b4d70a3f49b811c89299e..f6400b05e110a89de5a0152719198bd28a3caf6b 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -415,6 +415,12 @@ class ApertusModel(nn.Module):
             (".qkv_proj", ".v_proj", "v"),
         ]
         params_dict = dict(self.named_parameters())
+
+        # we need to load the buffers for beta and eps (XIELU)
+        for name, buffer in self.named_buffers():
+            if name.endswith(".beta") or name.endswith(".eps"):
+                params_dict[name] = buffer
+
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index 13ed4da0602ad5cb054d887d619dc976aec5039e..be82c2fd59644d272f5489ff0bcb86f7f01a066e 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -342,7 +342,7 @@ class ArceeModel(nn.Module):
 class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     """Arcee Model for causal language modeling, integrated with vLLM
     runtime."""
-    # Map fused module names to their sub-module components
+    # Map fused module names to their submodule components
     # (for quantization and LoRA)
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 32551d8102f32e637b77635179c55151a1889262..fd4d820a01e9d67b6332246f0d5e4326e974810e 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -29,7 +29,8 @@ from transformers import BartConfig
 from transformers.utils import logging
 
 from vllm.attention import Attention, AttentionType
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 8f23439655ed7257457a98d620a9e80e45b800a8..c07e5364814ac9ea5f4ccfde2223ec5f5b989f2d 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -562,7 +562,9 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
         self.bert = BertPoolingModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "bert"),
                                      embedding_class=BertEmbedding)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.classifier = nn.Linear(config.hidden_size,
+                                    config.num_labels,
+                                    dtype=vllm_config.model_config.head_dtype)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 3be7e11d947d5c466e4fce9032c704691bf45775..b758cbf28d89322914f6839ad5f2e3fb2ca63c5f 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -637,14 +637,14 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.new = GteNewModel(vllm_config=vllm_config,
                                prefix=prefix,
                                add_pooling_layer=True)
-        self.classifier = RowParallelLinear(config.hidden_size,
-                                            config.num_labels,
-                                            input_is_parallel=False,
-                                            bias=True,
-                                            quant_config=quant_config,
-                                            prefix=maybe_prefix(
-                                                prefix, "classifier"),
-                                            return_bias=False)
+        self.classifier = ReplicatedLinear(
+            config.hidden_size,
+            config.num_labels,
+            bias=True,
+            quant_config=quant_config,
+            params_dtype=vllm_config.model_config.head_dtype,
+            prefix=maybe_prefix(prefix, "classifier"),
+            return_bias=False)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 2f2b880bb0e144d84149bcbe872ed6a9a0da9052..ed98a3008c567878acbafc893c3143174efc3ec1 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -560,8 +560,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    def _create_image_input(self,
-                            **kwargs: object) -> Optional[Blip2ImageInputs]:
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Blip2ImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 377b7bf26a07a247ba7b1aeb6bd848d5c3ca689c..687af7a189ceaee91a77a62aa685cb39db6f70da 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -24,6 +24,14 @@ class VerifyAndUpdateConfig:
         raise NotImplementedError
 
 
+class Gemma3TextModelConfig:
+
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        hf_config = vllm_config.model_config.hf_config
+        hf_config.is_causal = not hf_config.use_bidirectional_attention
+
+
 class GteNewModelConfig(VerifyAndUpdateConfig):
 
     @staticmethod
@@ -210,8 +218,10 @@ class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         config = vllm_config.model_config.hf_config
-
         config.num_labels = 1
+        pooler_config = vllm_config.model_config.pooler_config
+        if pooler_config.logit_bias is None:
+            pooler_config.logit_bias = 2.65
 
 
 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
@@ -254,7 +264,7 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
         decoding_config = vllm_config.decoding_config
         if decoding_config.reasoning_backend == "":
-            decoding_config.reasoning_backend = "GptOss"
+            decoding_config.reasoning_backend = "openai_gptoss"
 
         # Increase the max capture size from 512 to 1024 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
@@ -302,7 +312,8 @@ class MambaModelConfig(VerifyAndUpdateConfig):
 
         # TODO(tdoublep): remove as full cuda graph support is added
         FCG_NOT_SUPPORTED_MODELS = [
-            "Lfm2ForCausalLM", "MiniMaxText01ForCausalLM"
+            "Lfm2ForCausalLM",
+            "MiniMaxText01ForCausalLM",
         ]
 
         if (model_config.architecture not in FCG_NOT_SUPPORTED_MODELS
@@ -407,6 +418,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
     "GteNewModel": GteNewModelConfig,
     "GteNewForSequenceClassification": GteNewModelConfig,
+    "Gemma3TextModel": Gemma3TextModelConfig,
     "NomicBertModel": NomicBertModelConfig,
     "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
     "Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index 0c9c83cf610001a2863cdac024c9334f3aacb91b..5e8447a7f48f9f50710ee783ea253a2814110ff8 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -37,8 +37,6 @@ class DeepseekV2Model(nn.Module):
         super().__init__()
         self.config = vllm_config. \
             speculative_config.draft_model_config.hf_config
-        model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.vocab_size = self.config.vocab_size
 
@@ -51,11 +49,8 @@ class DeepseekV2Model(nn.Module):
 
         self.layers = nn.ModuleList([
             DeepseekV2DecoderLayer(
-                self.config,
+                vllm_config,
                 prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                model_config=model_config,
-                cache_config=cache_config,
-                quant_config=quant_config,
             ) for i in range(self.config.num_hidden_layers)
         ])
 
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 0ad001be71c19172916a1413cc3dd94c34726e64..8fbf16d206a86460be7c62969ab9dac9709993d7 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import CacheConfig, ModelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -43,23 +43,19 @@ class SharedHead(nn.Module):
 
 class DeepSeekMultiTokenPredictorLayer(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        prefix: str,
-        model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
+    def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.eh_proj = nn.Linear(config.hidden_size * 2,
                                  config.hidden_size,
                                  bias=False)
         self.shared_head = SharedHead(config=config, quant_config=quant_config)
-        self.mtp_block = DeepseekV2DecoderLayer(config, prefix, model_config,
-                                                cache_config, quant_config)
+        self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix)
 
     def forward(
         self,
@@ -95,13 +91,8 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         # to map the exact layer index from weights
         self.layers = torch.nn.ModuleDict({
             str(idx):
-            DeepSeekMultiTokenPredictorLayer(
-                config,
-                f"{prefix}.layers.{idx}",
-                model_config=vllm_config.model_config,
-                cache_config=vllm_config.cache_config,
-                quant_config=vllm_config.quant_config,
-            )
+            DeepSeekMultiTokenPredictorLayer(vllm_config,
+                                             f"{prefix}.layers.{idx}")
             for idx in range(self.mtp_start_layer_idx,
                              self.mtp_start_layer_idx + self.num_mtp_layers)
         })
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 9e75405dcab3656e623921ab44a3ffaba242f085..0b08a5e809a2cee8b77aa93112f863c40e51e2bd 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -34,29 +34,34 @@ import torch
 from torch import nn
 from transformers import DeepseekV2Config, DeepseekV3Config
 
+import vllm.envs as envs
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CacheConfig, ModelConfig, VllmConfig,
-                         get_current_vllm_config)
+from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (get_ep_group, get_pp_group,
-                              get_tensor_model_parallel_world_size)
+                              get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_gather)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
-                                               MergedReplicatedLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttention
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
+from vllm.utils import cdiv, direct_register_custom_op
 
 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -74,19 +79,27 @@ class DeepseekV2MLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        is_sequence_parallel=False,
         prefix: str = "",
     ) -> None:
         super().__init__()
+
+        # If is_sequence_parallel, the input and output tensors are sharded
+        # across the ranks within the tp_group. In this case the weights are
+        # replicated and no collective ops are needed.
+        # Otherwise we use standard TP with an allreduce at the end.
         self.gate_up_proj = MergedColumnParallelLinear(
             hidden_size, [intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            disable_tp=is_sequence_parallel,
             prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(intermediate_size,
                                            hidden_size,
                                            bias=False,
                                            quant_config=quant_config,
                                            reduce_results=reduce_results,
+                                           disable_tp=is_sequence_parallel,
                                            prefix=f"{prefix}.down_proj")
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
@@ -100,17 +113,58 @@ class DeepseekV2MLP(nn.Module):
         return x
 
 
+# Chunk x along the num_tokens axis for sequence parallelism
+# NOTE: This is wrapped in a torch custom op to work around the following issue:
+# The output tensor can have a sequence length 0 at small input sequence lengths
+# even though we explicitly pad to avoid this.
+def sequence_parallel_chunk(x: torch.Tensor) -> torch.Tensor:
+    tp_size = get_tensor_model_parallel_world_size()
+    tp_rank = get_tensor_model_parallel_rank()
+
+    # all_gather needs the sequence length to be divisible by tp_size
+    seq_len = x.size(0)
+    remainder = seq_len % tp_size
+    if remainder != 0:
+        pad_len = tp_size - remainder
+        x = nn.functional.pad(x, (0, 0, 0, pad_len))
+
+    chunk = x.shape[0] // tp_size
+    start = tp_rank * chunk
+    return torch.narrow(x, 0, start, chunk)
+
+
+def sequence_parallel_chunk_fake(x: torch.Tensor) -> torch.Tensor:
+    tp_size = get_tensor_model_parallel_world_size()
+    seq_len = cdiv(x.size(0), tp_size)
+    shape = list(x.shape)
+    shape[0] = seq_len
+    out = torch.empty(shape, dtype=x.dtype, device=x.device)
+    return out
+
+
+direct_register_custom_op(
+    op_name="sequence_parallel_chunk",
+    op_func=sequence_parallel_chunk,
+    mutates_args=[],
+    fake_impl=sequence_parallel_chunk_fake,
+    dispatch_key=current_platform.dispatch_key,
+    tags=(torch.Tag.needs_fixed_stride_order, ),
+)
+
+
 class DeepseekV2MoE(nn.Module):
 
     def __init__(
         self,
         config: Union[DeepseekV2Config, DeepseekV3Config],
+        parallel_config: ParallelConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        enable_eplb: bool = False,
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
         self.routed_scaling_factor = config.routed_scaling_factor
 
         self.ep_group = get_ep_group().device_group
@@ -119,6 +173,21 @@ class DeepseekV2MoE(nn.Module):
         self.n_routed_experts: int = config.n_routed_experts
         self.n_shared_experts: int = config.n_shared_experts
 
+        # The all_reduce at the end of attention (during o_proj) means that
+        # inputs are replicated across each rank of the tensor parallel group.
+        # If using expert-parallelism with DeepEP All2All ops, replicated
+        # tokens results in useless duplicate computation and communication.
+        #
+        # In this case, ensure the input to the experts is sequence parallel
+        # to avoid the excess work.
+        #
+        # Not needed for pplx-kernels as it can handle duplicate input tokens.
+        self.is_sequence_parallel = (envs.VLLM_ALL2ALL_BACKEND
+                                     in ("deepep_high_throughput",
+                                         "deepep_low_latency")
+                                     and parallel_config.enable_expert_parallel
+                                     and self.tp_size > 1)
+
         if config.hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                              "Only silu is supported for now.")
@@ -135,9 +204,8 @@ class DeepseekV2MoE(nn.Module):
             self.gate.e_score_correction_bias = None
 
         # Load balancing settings.
-        vllm_config = get_current_vllm_config()
-        eplb_config = vllm_config.parallel_config.eplb_config
-        self.enable_eplb = enable_eplb
+        eplb_config = parallel_config.eplb_config
+        self.enable_eplb = parallel_config.enable_eplb
 
         self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
@@ -150,64 +218,105 @@ class DeepseekV2MoE(nn.Module):
         self.physical_expert_end = (self.physical_expert_start +
                                     self.n_local_physical_experts)
 
-        self.experts = FusedMoE(
-            num_experts=config.n_routed_experts,
-            top_k=config.num_experts_per_tok,
-            hidden_size=config.hidden_size,
-            intermediate_size=config.moe_intermediate_size,
-            reduce_results=False,
-            renormalize=config.norm_topk_prob,
-            quant_config=quant_config,
-            use_grouped_topk=True,
-            num_expert_group=config.n_group,
-            topk_group=config.topk_group,
-            prefix=f"{prefix}.experts",
-            scoring_func=config.scoring_func,
-            routed_scaling_factor=self.routed_scaling_factor,
-            e_score_correction_bias=self.gate.e_score_correction_bias,
-            enable_eplb=self.enable_eplb,
-            num_redundant_experts=self.n_redundant_experts)
-
-        if config.n_shared_experts is not None:
+        if config.n_shared_experts is None:
+            self.experts = FusedMoE(
+                num_experts=config.n_routed_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.moe_intermediate_size,
+                reduce_results=False,
+                renormalize=config.norm_topk_prob,
+                quant_config=quant_config,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                topk_group=config.topk_group,
+                prefix=f"{prefix}.experts",
+                scoring_func=config.scoring_func,
+                # we do scaling outside, set factor to 1.0 to avoid double mul
+                routed_scaling_factor=1.0,
+                e_score_correction_bias=self.gate.e_score_correction_bias,
+                enable_eplb=self.enable_eplb,
+                num_redundant_experts=self.n_redundant_experts,
+                is_sequence_parallel=self.is_sequence_parallel,
+            )
+            self.shared_experts = None
+        else:
             intermediate_size = (config.moe_intermediate_size *
                                  config.n_shared_experts)
+
             self.shared_experts = DeepseekV2MLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
-                reduce_results=self.experts.must_reduce_shared_expert_outputs(
-                ),
+                is_sequence_parallel=self.is_sequence_parallel,
+                reduce_results=False,
                 prefix=f"{prefix}.shared_experts",
             )
 
+            self.experts = SharedFusedMoE(
+                shared_experts=self.shared_experts,
+                num_experts=config.n_routed_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.moe_intermediate_size,
+                reduce_results=False,
+                renormalize=config.norm_topk_prob,
+                quant_config=quant_config,
+                use_grouped_topk=True,
+                num_expert_group=config.n_group,
+                topk_group=config.topk_group,
+                prefix=f"{prefix}.experts",
+                scoring_func=config.scoring_func,
+                # we do scaling outside, set factor to 1.0 to avoid double mul
+                routed_scaling_factor=1.0,
+                e_score_correction_bias=self.gate.e_score_correction_bias,
+                enable_eplb=self.enable_eplb,
+                num_redundant_experts=self.n_redundant_experts,
+                is_sequence_parallel=self.is_sequence_parallel,
+            )
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
-        if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
+
+        # Chunk the hidden states so they aren't replicated across TP ranks.
+        # This avoids duplicate computation in self.experts.
+        # TODO: We can replace the all_reduce at the end of attn with a
+        # reduce_scatter instead of chunking here.
+        if self.is_sequence_parallel:
+            hidden_states = torch.ops.vllm.sequence_parallel_chunk(
+                hidden_states)
+
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
 
-        if hidden_states.dtype != torch.float16:
-            final_hidden_states = self.experts(
-                hidden_states=hidden_states,
-                router_logits=router_logits) * self.routed_scaling_factor
+        fused_moe_out = self.experts(hidden_states=hidden_states,
+                                     router_logits=router_logits)
+
+        if self.shared_experts is not None:
+            shared_output, final_hidden_states = fused_moe_out
         else:
-            # Fix FP16 overflow
-            # See DeepseekV2DecoderLayer for more details.
-            final_hidden_states = self.experts(hidden_states=hidden_states,
-                                               router_logits=router_logits)
-        if shared_output is not None:
-            if hidden_states.dtype != torch.float16:
-                final_hidden_states = final_hidden_states + shared_output
-            else:
-                # Fix FP16 overflow
-                # See DeepseekV2DecoderLayer for more details.
-                final_hidden_states = final_hidden_states + shared_output \
-                    * (1. / self.routed_scaling_factor)
+            shared_output = None
+            final_hidden_states = fused_moe_out
 
-        if self.tp_size > 1:
+        # Fix FP16 overflow
+        # See DeepseekV2DecoderLayer for more details.
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= (1. / self.routed_scaling_factor)
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
+
+        if self.is_sequence_parallel:
+            final_hidden_states = tensor_model_parallel_all_gather(
+                final_hidden_states, 0)
+            final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
             final_hidden_states = (
                 self.experts.maybe_all_reduce_tensor_model_parallel(
                     final_hidden_states))
@@ -413,12 +522,13 @@ class DeepseekV2MLAAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
 
         if self.q_lora_rank is not None:
-            self.fused_qkv_a_proj = MergedReplicatedLinear(
+            self.fused_qkv_a_proj = MergedColumnParallelLinear(
                 self.hidden_size,
                 [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
                 bias=False,
                 quant_config=quant_config,
-                prefix=f"{prefix}.fused_qkv_a_proj")
+                prefix=f"{prefix}.fused_qkv_a_proj",
+                disable_tp=True)
         else:
             self.kv_a_proj_with_mqa = ReplicatedLinear(
                 self.hidden_size,
@@ -471,86 +581,54 @@ class DeepseekV2MLAAttention(nn.Module):
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.scaling = self.scaling * mscale * mscale
 
-        # In the MLA backend, kv_cache includes both k_c and
-        # pe (i.e. decoupled position embeddings). In particular,
-        # the concat_and_cache_mla op requires
-        #     k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
-        # i.e.
-        #     kv_lora_rank + qk_rope_head_dim == head_size
-        self.mla_attn = Attention(
-            num_heads=self.num_local_heads,
-            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
-            scale=self.scaling,
-            num_kv_heads=1,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.attn",
-            use_mla=True,
-            # MLA Args
-            q_lora_rank=self.q_lora_rank,
-            kv_lora_rank=self.kv_lora_rank,
-            qk_nope_head_dim=self.qk_nope_head_dim,
-            qk_rope_head_dim=self.qk_rope_head_dim,
-            qk_head_dim=self.qk_head_dim,
-            v_head_dim=self.v_head_dim,
+        mla_modules = MLAModules(
+            kv_a_layernorm=self.kv_a_layernorm,
             kv_b_proj=self.kv_b_proj,
+            rotary_emb=self.rotary_emb,
+            o_proj=self.o_proj,
+            fused_qkv_a_proj=self.fused_qkv_a_proj
+            if self.q_lora_rank is not None else None,
+            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa
+            if self.q_lora_rank is None else None,
+            q_a_layernorm=self.q_a_layernorm
+            if self.q_lora_rank is not None else None,
+            q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None,
+            q_proj=self.q_proj if self.q_lora_rank is None else None,
+        )
+        self.mla_attn = MultiHeadLatentAttention(
+            self.hidden_size,
+            self.num_local_heads,
+            self.scaling,
+            self.qk_nope_head_dim,
+            self.qk_rope_head_dim,
+            self.v_head_dim,
+            self.q_lora_rank,
+            self.kv_lora_rank,
+            mla_modules,
+            cache_config,
+            quant_config,
+            prefix,
         )
-
-        self.prefix = prefix
-        self.debug_layer_idx = int(self.prefix.split(".")[-2])
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
-        q_c = None
-        kv_lora = None
-
-        if self.q_lora_rank is not None:
-            qkv_lora = self.fused_qkv_a_proj(hidden_states)[0]
-            q_c, kv_lora = qkv_lora.split(
-                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim],
-                dim=-1,
-            )
-            q_c = self.q_a_layernorm(q_c)
-            q = self.q_b_proj(q_c)[0]
-        else:
-            kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0]
-            q = self.q_proj(hidden_states)[0]
-
-        kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim],
-                                   dim=-1)
-        kv_c_normed = self.kv_a_layernorm(kv_c)
-
-        q = q.view(-1, self.num_local_heads, self.qk_head_dim)
-        # Add head dim of 1 to k_pe
-        k_pe = k_pe.unsqueeze(1)
-
-        q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb(
-            positions, q[..., self.qk_nope_head_dim:], k_pe)
-
-        attn_out = self.mla_attn(
-            q,
-            kv_c_normed,
-            k_pe,
-            output_shape=(hidden_states.shape[0],
-                          self.num_local_heads * self.v_head_dim))
-        return self.o_proj(attn_out)[0]
+        return self.mla_attn(positions, hidden_states)
 
 
 class DeepseekV2DecoderLayer(nn.Module):
 
-    def __init__(
-        self,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
-        prefix: str,
-        model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        enable_eplb: bool = False,
-    ) -> None:
+    def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
@@ -587,9 +665,9 @@ class DeepseekV2DecoderLayer(nn.Module):
                 and layer_idx % config.moe_layer_freq == 0):
             self.mlp = DeepseekV2MoE(
                 config=config,
+                parallel_config=parallel_config,
                 quant_config=quant_config,
                 prefix=f"{prefix}.mlp",
-                enable_eplb=enable_eplb,
             )
         else:
             self.mlp = DeepseekV2MLP(
@@ -659,10 +737,7 @@ class DeepseekV2Model(nn.Module):
         super().__init__()
 
         config = vllm_config.model_config.hf_config
-        model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        enable_eplb = vllm_config.parallel_config.enable_eplb
         self.config = config
 
         self.vocab_size = config.vocab_size
@@ -678,14 +753,7 @@ class DeepseekV2Model(nn.Module):
 
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
-            lambda prefix: DeepseekV2DecoderLayer(
-                config,
-                prefix,
-                model_config=model_config,
-                cache_config=cache_config,
-                quant_config=quant_config,
-                enable_eplb=enable_eplb,
-            ),
+            lambda prefix: DeepseekV2DecoderLayer(vllm_config, prefix),
             prefix=f"{prefix}.layers")
 
         if get_pp_group().is_last_rank:
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 5eab02b17151c8a09a22efa59df4fc0d6a970d91..d7ae8206baca5eaa0926280487d55dc0030d16d2 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -21,7 +21,8 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype
 from vllm.model_executor.models.transformers import replace_linear_class
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
+                                    MultiModalKwargsItems, MultiModalUUIDDict,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -290,7 +291,7 @@ class DeepseekVL2MultiModalProcessor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -302,7 +303,7 @@ class DeepseekVL2MultiModalProcessor(
                 mm_data_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         return super()._cached_apply_hf_processor(
@@ -310,7 +311,7 @@ class DeepseekVL2MultiModalProcessor(
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
 
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index a5477af8694b441f6d7e50af00e08324dbc26700..4ddf906dddefedabb7eb10cc448afb649cf6b113 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -137,7 +137,8 @@ class Dots1MoE(nn.Module):
             topk_group=config.topk_group,
             prefix=f"{prefix}.experts",
             scoring_func=config.scoring_func,
-            routed_scaling_factor=self.routed_scaling_factor,
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            routed_scaling_factor=1.0,
             e_score_correction_bias=self.gate.e_score_correction_bias)
 
         if config.n_shared_experts is not None:
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index d880fc434e20fd17f0210a89ff3bb197b41a35bc..97aace5a20c3a28f47fbe4af9167b618dca8e20e 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -66,8 +66,6 @@ from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
-_MAX_FRAMES_PER_VIDEO = 16
-
 # === Vision Transformer === #
 
 
@@ -839,6 +837,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
+
     def _get_vision_info(
         self,
         *,
@@ -964,8 +971,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         max_image_tokens = self.get_max_image_tokens() * max_images
         max_total_frames = self._get_max_video_frames(seq_len -
                                                       max_image_tokens)
-        max_frames_per_video = min(max_total_frames // max(max_videos, 1),
-                                   _MAX_FRAMES_PER_VIDEO)
+        max_frames_per_video = max_total_frames // max(max_videos, 1)
 
         return max(max_frames_per_video, 2)
 
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 780974c3b758e8187d71b916ed7f42a0be716130..6034505fa7d68b2c7c265fbbe758b4cb0a53bc46 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -287,8 +287,13 @@ class Ernie4_5_VLMoeMoE(nn.Module):
         if self.has_shared_experts:
             shared_output = self.shared_experts(hidden_states)
 
-        if visual_token_mask is not None and visual_token_mask.any():
-            # assert visual_token_mask.shape[0] != hidden_states.shape[0]
+        if visual_token_mask is not None and visual_token_mask.all():
+            # only vision modal input
+            router_logits, _ = self.vision_experts_gate(hidden_states)
+            final_hidden_states = self.vision_experts(
+                hidden_states=hidden_states, router_logits=router_logits)
+        elif visual_token_mask is not None and visual_token_mask.any():
+            # text and vision modals input
             visual_token_mask = visual_token_mask.repeat(
                 1, self.hidden_size).bool()
             text_token_mask = ~visual_token_mask
@@ -310,7 +315,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
                 hidden_states=vision_hidden_states,
                 router_logits=vision_router_logits).flatten()
         else:
-            # text modal input processing directly
+            # only text modal input
             text_router_logits, _ = self.text_experts_gate(hidden_states)
 
             final_hidden_states = self.text_experts(
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 971fcbd2aa2754cc2f182a8f83f6eef118dcc3f1..e94c43a47f76a5461a95d1919beacfe374fa57bc 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -164,8 +164,8 @@ class Exaone4Attention(nn.Module):
         is_sliding = config.layer_types[layer_idx] == "sliding_attention"
         self.sliding_window = config.sliding_window if is_sliding else None
 
-        # apply rotary embeddings to every layer
-        self.apply_all_layers = not is_sliding
+        # apply rotary embeddings to every layer in full attention models
+        self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -201,7 +201,7 @@ class Exaone4Attention(nn.Module):
         k = self.k_norm(k)
         k = k.flatten(-2, -1)
 
-        if self.sliding_window or self.apply_all_layers:
+        if self.sliding_window or self.apply_rope_all_layers:
             q, k = self.rotary_emb(positions, q, k)
         attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 410c715d5241b717d4699322b21522f42ae02524..1263e3049a14a9013e609f357e98e63184b716b1 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -24,7 +24,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import Gemma3TextConfig
 
-from vllm.attention import Attention
+from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -44,6 +44,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
+from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, extract_layer_index,
                     is_pp_missing_parameter,
@@ -169,16 +170,24 @@ class Gemma3Attention(nn.Module):
             rope_scaling=self.rope_scaling,
         )
 
-        # Initialize the attention.
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              logits_soft_cap=attn_logits_soft_cap,
-                              per_layer_sliding_window=sliding_window,
-                              prefix=f"{prefix}.attn")
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        attn_cls = (EncoderOnlyAttention
+                    if attn_type == AttentionType.ENCODER_ONLY else Attention)
+
+        self.attn = attn_cls(self.num_heads,
+                             self.head_dim,
+                             self.scaling,
+                             num_kv_heads=self.num_kv_heads,
+                             cache_config=cache_config,
+                             quant_config=quant_config,
+                             attn_type=attn_type,
+                             logits_soft_cap=attn_logits_soft_cap,
+                             per_layer_sliding_window=sliding_window,
+                             prefix=f"{prefix}.attn")
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index d59dde1560aea26f0747db7aaf3cf74d7a9b1ca6..3074451e40a4d321e45329f8024917ea04aec1d6 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Optional, TypedDict, Union, cast
+from typing import Any, Literal, Optional, TypedDict, Union, cast
 
+import numpy as np
 import torch
+# yapf: disable
 from torch import nn
 from transformers import AutoModel, BatchFeature
 from transformers.models.gemma3n import (Gemma3nAudioConfig,
@@ -13,7 +15,8 @@ from transformers.models.gemma3n import (Gemma3nAudioConfig,
                                          Gemma3nVisionConfig)
 from transformers.models.siglip import SiglipImageProcessorFast
 
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -21,13 +24,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargsItems)
 from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
-# yapf: disable
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo,
                                         MultiModalPromptUpdates,
@@ -40,7 +43,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal
+from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
+                         SupportsTranscription)
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
@@ -58,7 +62,8 @@ class Gemma3nImagePixelInputs(TypedDict):
 
 
 class Gemma3nAudioInputs(TypedDict):
-    input_features: torch.Tensor
+    input_features: Union[torch.Tensor, list[torch.Tensor]]
+    input_features_padded: torch.Tensor
     """Shape: `(batch_size * num_audio, seq_length, num_features)`"""
     input_features_mask: torch.Tensor
     """Shape: `(batch_size * num_audio, seq_length)`"""
@@ -174,7 +179,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
     ) -> BatchFeature:
 
         # HF Transformers audio processor no longer accepts `audios` key.
-        # We pop `audios` and replace it with `audio` key to surpress
+        # We pop `audios` and replace it with `audio` key to suppress
         # the warning.
         if 'audios' in mm_data:
             mm_data['audio'] = mm_data.pop('audios')
@@ -184,8 +189,13 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
             mm_kwargs,
             tok_kwargs,
         )
+
         if 'input_features' in processed_outputs:
-            # Avoid padding since we need the output of each item to be
+            # Padding enables audio_tower to run in batched mode
+            processed_outputs["input_features_padded"] = \
+                processed_outputs["input_features"]
+
+            # Unpad features here since we need the output of each item to be
             # independent of other items for the cache to work correctly
             unpadded_features = [
                 f[mask] for f, mask in zip(
@@ -202,9 +212,11 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
 
-        return dict(pixel_values=MultiModalFieldConfig.batched("image"),
-                    input_features=MultiModalFieldConfig.batched("audio"),
-                    input_features_mask=MultiModalFieldConfig.batched("audio"))
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            input_features=MultiModalFieldConfig.batched("audio"),
+            input_features_padded=MultiModalFieldConfig.batched("audio"),
+            input_features_mask=MultiModalFieldConfig.batched("audio"))
 
     def _get_prompt_updates(
         self,
@@ -410,7 +422,10 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 @MULTIMODAL_REGISTRY.register_processor(Gemma3nMultiModalProcessor,
                                         info=Gemma3nProcessingInfo,
                                         dummy_inputs=Gemma3nDummyInputsBuilder)
-class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal):
+class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                      SupportsTranscription):
+    supported_languages = ISO639_1_SUPPORTED_LANGS
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -509,9 +524,14 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal):
         if input_features_mask is None:
             return None
 
+        input_features_padded = kwargs.pop("input_features_padded", None)
+        if input_features_padded is None:
+            return None
+
         return Gemma3nAudioInputs(
             input_features=input_features,
             input_features_mask=input_features_mask,
+            input_features_padded=input_features_padded,
         )
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -557,7 +577,8 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal):
         audio_input: Gemma3nAudioInputs,
     ) -> list[torch.Tensor]:
         assert self.audio_tower is not None
-        input_features = audio_input["input_features"].squeeze(1)
+        # Run on padded features to enable batching
+        input_features = audio_input["input_features_padded"].squeeze(1)
         input_features_mask = audio_input["input_features_mask"].squeeze(1)
         audio_outputs, audio_mask = self.audio_tower(input_features,
                                                      ~input_features_mask)
@@ -694,3 +715,53 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal):
             return "<audio_soft_token>"
         else:
             raise ValueError(f"Unsupported modality: {modality}")
+
+    @classmethod
+    def get_generation_prompt(cls, audio: np.ndarray,
+                              stt_config: SpeechToTextConfig,
+                              model_config: ModelConfig,
+                              language: Optional[str],
+                              task_type: Literal["transcribe", "translate"],
+                              request_prompt: str,
+                              to_language: Optional[str]) -> PromptType:
+        """
+        Gemma3n supports "free-form" transcription.
+        We fix its prompt here to standardize transcriptions/translations 
+        requests.
+        """
+        # Transcribe this audio [into <>] | for transcription
+        # Translate this audio [from <> into <>] | for translation
+        prompt = "<start_of_turn>user\n"
+        prompt += "Transcribe" if task_type == "transcribe" else "Translate"
+        prompt += " this audio"
+
+        # We assume the language is a valid ISO 639-1 code.
+        full_lang_name = cls.supported_languages.get(language, "")
+        # Translation only for now
+        full_lang_name_to = cls.supported_languages.get(to_language, "")
+
+        if task_type == "transcribe" and full_lang_name:
+            prompt += f" into {full_lang_name}"
+        elif task_type == "translate":
+            if full_lang_name:
+                prompt += f" from {full_lang_name}"
+            if full_lang_name_to:
+                prompt += f" into {full_lang_name_to}"
+
+        prompt += ": <audio_soft_token><end_of_turn>\n<start_of_turn>model\n"
+
+        audio = (audio, stt_config.sample_rate)
+        prompts_dict = {"multi_modal_data": {"audio": audio}, "prompt": prompt}
+        return cast(PromptType, prompts_dict)
+
+    @classmethod
+    def get_speech_to_text_config(cls, model_config: ModelConfig,
+                                  task_type: str) -> SpeechToTextConfig:
+        return SpeechToTextConfig(
+            # Let's set this to 30 as suggested in the docs for now, although
+            # the model is only limited by its context length.
+            max_audio_clip_s=30,
+            sample_rate=16000,
+            # TODO enable chunking after more thorough testing.
+            min_energy_split_window_size=None,
+        )
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 662728e6b13934f19c8883307294d2c685e27cac..539381b618000b33178844067f9352c18e204ad0 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -45,7 +45,8 @@ from transformers.models.glm4v.video_processing_glm4v import (
 from transformers.video_utils import VideoMetadata
 
 from vllm.config import VllmConfig
-from vllm.distributed import parallel_state
+from vllm.distributed import (get_tensor_model_parallel_world_size,
+                              parallel_state)
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
@@ -66,6 +67,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -153,7 +155,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
 
 Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
 
-# === Vision Encoder === #
+# ==== Vision Encoder ==== #
 
 
 class Glm4vVisionMLP(nn.Module):
@@ -165,6 +167,7 @@ class Glm4vVisionMLP(nn.Module):
         bias: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -172,12 +175,17 @@ class Glm4vVisionMLP(nn.Module):
             output_sizes=[hidden_features] * 2,
             bias=bias,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(hidden_features,
-                                           in_features,
-                                           bias=bias,
-                                           quant_config=quant_config,
-                                           prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.gate_up_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+            disable_tp=use_data_parallel,
+        )
         self.act_fn = SiluAndMul()
 
     def forward(self, x: torch.Tensor):
@@ -218,11 +226,14 @@ class Glm4vVisionAttention(nn.Module):
         projection_size: int,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         # Per attention head and per partition values.
-        self.tp_size = parallel_state.get_tensor_model_parallel_world_size()
-        self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        self.tp_size = (1 if use_data_parallel else
+                        get_tensor_model_parallel_world_size())
+        self.tp_rank = (0 if use_data_parallel else
+                        parallel_state.get_tensor_model_parallel_rank())
         self.hidden_size_per_attention_head = dist_utils.divide(
             projection_size, num_heads)
         self.num_attention_heads_per_partition = dist_utils.divide(
@@ -235,8 +246,9 @@ class Glm4vVisionAttention(nn.Module):
             total_num_kv_heads=num_heads,
             bias=False,
             quant_config=quant_config,
-            # Change qkv prefix to align with GLM-4.5V-FP8 quantization config
+            # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg
             prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
+            disable_tp=use_data_parallel,
         )
         self.proj = RowParallelLinear(
             input_size=projection_size,
@@ -244,6 +256,7 @@ class Glm4vVisionAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.proj",
             bias=False,
+            disable_tp=use_data_parallel,
         )
 
         # Detect attention implementation.
@@ -259,23 +272,10 @@ class Glm4vVisionAttention(nn.Module):
     def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
         # [s, b, 3 * head * head_dim]
         seq_len, bs, _ = qkv.shape
-        if self.tp_size > 1:
-            qkv = all_gather_interleave(qkv, self.qkv.hidden_size,
-                                        self.tp_size)
 
         # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim]
         q, k, v = qkv.chunk(3, dim=2)
 
-        # 3 * [s, b, head * head_dim]
-        if self.tp_size > 1:
-            splitter = partial(
-                dist_utils.split_tensor_along_last_dim,
-                num_partitions=self.tp_size,
-            )
-            q = splitter(q)[self.tp_rank]
-            k = splitter(k)[self.tp_rank]
-            v = splitter(v)[self.tp_rank]
-
         # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim]
         new_shape = (
             seq_len,
@@ -375,6 +375,7 @@ class Glm4vVisionBlock(nn.Module):
         norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         if norm_layer is None:
@@ -387,6 +388,7 @@ class Glm4vVisionBlock(nn.Module):
             projection_size=dim,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel,
         )
         self.mlp = Glm4vVisionMLP(
             dim,
@@ -394,6 +396,7 @@ class Glm4vVisionBlock(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
         )
 
     def forward(
@@ -456,15 +459,19 @@ class Glm4vPatchMerger(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         bias: bool = False,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = d_model
-        self.proj = ColumnParallelLinear(self.hidden_size,
-                                         self.hidden_size,
-                                         bias=bias,
-                                         gather_output=True,
-                                         quant_config=quant_config,
-                                         prefix=f"{prefix}.proj")
+        self.proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=bias,
+            gather_output=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+            disable_tp=use_data_parallel,
+        )
         self.post_projection_norm = nn.LayerNorm(self.hidden_size)
         self.gate_up_proj = MergedColumnParallelLinear(
             input_size=self.hidden_size,
@@ -472,6 +479,7 @@ class Glm4vPatchMerger(nn.Module):
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
+            disable_tp=use_data_parallel,
         )
         self.down_proj = RowParallelLinear(
             context_dim,
@@ -479,6 +487,7 @@ class Glm4vPatchMerger(nn.Module):
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.down_proj",
+            disable_tp=use_data_parallel,
         )
         self.act_fn = SiluAndMul()
         self.extra_activation_func = nn.GELU()
@@ -548,14 +557,33 @@ class Glm4vVisionEmbeddings(nn.Module):
                                                         dtype=torch.float32))
 
             # Calculate target dimensions for each patch
-            target_h = torch.cat([
-                image_shapes[i, 1].repeat(lengths[i])
-                for i in range(len(lengths))
-            ]).to(device=device, dtype=torch.float32)
-            target_w = torch.cat([
-                image_shapes[i, 2].repeat(lengths[i])
-                for i in range(len(lengths))
-            ]).to(device=device, dtype=torch.float32)
+            # Add bounds checking for data parallel mode
+            if len(lengths) > image_shapes.shape[0]:
+                # In data parallel mode, some GPUs might not have all
+                # image shapes
+                # Use available image shapes, cycling if necessary
+                target_h_list = []
+                target_w_list = []
+                for i in range(len(lengths)):
+                    # Cycle through available shapes
+                    shape_idx = i % image_shapes.shape[0]
+                    target_h_list.append(image_shapes[shape_idx,
+                                                      1].repeat(lengths[i]))
+                    target_w_list.append(image_shapes[shape_idx,
+                                                      2].repeat(lengths[i]))
+                target_h = torch.cat(target_h_list).to(device=device,
+                                                       dtype=torch.float32)
+                target_w = torch.cat(target_w_list).to(device=device,
+                                                       dtype=torch.float32)
+            else:
+                target_h = torch.cat([
+                    image_shapes[i, 1].repeat(lengths[i])
+                    for i in range(len(lengths))
+                ]).to(device=device, dtype=torch.float32)
+                target_w = torch.cat([
+                    image_shapes[i, 2].repeat(lengths[i])
+                    for i in range(len(lengths))
+                ]).to(device=device, dtype=torch.float32)
 
             # Normalize coordinates to [-1, 1] range for grid_sample
             h_coords = h_coords.to(device=device, dtype=torch.float32)
@@ -629,6 +657,7 @@ class Glm4vVisionTransformer(nn.Module):
         norm_eps: float = 1e-6,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -638,6 +667,7 @@ class Glm4vVisionTransformer(nn.Module):
         depth = vision_config.depth
         self.hidden_size = vision_config.hidden_size
         self.num_heads = vision_config.num_heads
+        self.use_data_parallel = use_data_parallel
 
         self.patch_size = vision_config.patch_size
         self.spatial_merge_size = vision_config.spatial_merge_size
@@ -661,6 +691,7 @@ class Glm4vVisionTransformer(nn.Module):
                 norm_layer=norm_layer,
                 quant_config=quant_config,
                 prefix=f"{prefix}.blocks.{layer_idx}",
+                use_data_parallel=self.use_data_parallel,
             ) for layer_idx in range(depth)
         ])
         self.merger = Glm4vPatchMerger(
@@ -669,6 +700,7 @@ class Glm4vVisionTransformer(nn.Module):
             quant_config=quant_config,
             bias=False,
             prefix=f"{prefix}.merger",
+            use_data_parallel=self.use_data_parallel,
         )
         self.embeddings = Glm4vVisionEmbeddings(vision_config)
 
@@ -731,8 +763,11 @@ class Glm4vVisionTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        grid_thw: torch.Tensor,
+        grid_thw: list[list[int]],
     ) -> torch.Tensor:
+        # Convert grid_thw to tensor (always expecting list format now)
+        grid_thw = torch.tensor(grid_thw, device=x.device, dtype=torch.long)
+
         # patchify
         x = x.to(device=self.device, dtype=self.dtype)
         x = self.patch_embed(x)
@@ -988,6 +1023,43 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
             selected_timestamps.append(timestamps_list[idx])
         return selected_timestamps
 
+    def _construct_video_placeholder(
+        self,
+        video_array: np.ndarray,
+        metadata: dict[str, Any],
+        grid_thw: torch.Tensor,
+    ) -> str:
+        hf_processor = self.get_hf_processor()
+        tokenizer = self.get_tokenizer()
+        image_processor = hf_processor.image_processor
+
+        hf_config = self.get_hf_config()
+        boi_token_id = hf_config.image_start_token_id
+        eoi_token_id = hf_config.image_end_token_id
+        bov_token_id = hf_config.video_start_token_id
+        eov_token_id = hf_config.video_end_token_id
+        merge_length = image_processor.merge_size**2
+
+        assert isinstance(grid_thw, torch.Tensor)
+        timestamps = self._get_video_second_idx(metadata, len(video_array))
+        frames_idx_token = [
+            tokenizer.encode(str(i), add_special_tokens=False)
+            for i in timestamps
+        ]
+        T, H, W = grid_thw
+        num_tokens_per_frame = int(H * W) // merge_length
+        placeholder = []
+        placeholder.append(bov_token_id)
+        for frame_idx in frames_idx_token:
+            placeholder.append(boi_token_id)
+            placeholder.extend([hf_processor.video_token_id] *
+                               num_tokens_per_frame)
+            placeholder.append(eoi_token_id)
+            placeholder.extend(frame_idx)
+        placeholder.append(eov_token_id)
+
+        return placeholder
+
 
 class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
 
@@ -1083,17 +1155,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
             for item in mm_data.pop("videos", []):
                 video_array, metadata = item
 
-                # FIXME(Isotr0py): Activate the below logic after we can disable
-                # resampling from video loader backend.
-                # assert metadata["total_num_frames"] == len(video_array), (
-                #     f"Total frames {metadata['total_num_frames']} does not "
-                #     f"match the length of video array {len(video_array)}.")
+                if metadata["video_backend"] == "opencv_dynamic":
+                    mm_kwargs["do_sample_frames"] = False
 
-                # NOTE: Temporary workaround for resampled videos.
-                # this can cause a divergence with HF implementation if
-                # the input video is resampled in advance.
-
-                if metadata["total_num_frames"] != len(video_array):
+                elif metadata["total_num_frames"] != len(video_array):
                     logger.warning(
                         "Total frames in metadata "
                         "(%s) does not match the length of "
@@ -1105,11 +1170,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                         len(video_array),
                     )
                     metadata["total_num_frames"] = len(video_array)
-                metadata = VideoMetadata(**metadata)
 
                 video_mm_data = dict()
                 video_mm_data["videos"] = [[video_array]]
-                video_mm_data["video_metadata"] = [[metadata]]
+                video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
 
                 video_outputs = super()._call_hf_processor(
                     prompt="<|begin_of_video|><|video|><|end_of_video|>",
@@ -1117,11 +1181,23 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
                     mm_kwargs=mm_kwargs,
                     tok_kwargs=tok_kwargs,
                 )
-                input_ids = video_outputs.pop("input_ids")
-                input_ids[input_ids == processor.image_token_id] = (
-                    processor.video_token_id)
-                video_placeholder = processor.tokenizer.batch_decode(
-                    input_ids)[0]
+                if "do_sample_frames" in mm_kwargs and not mm_kwargs[
+                        "do_sample_frames"]:
+                    # Transformers v4.55 has incorrect timestamps issue for
+                    # skip sampling. We construct the placeholder manually to
+                    # get placeholders with correct timestamps.
+                    placeholder = self.info._construct_video_placeholder(
+                        video_array,
+                        metadata,
+                        video_outputs["video_grid_thw"].squeeze(0),
+                    )
+                    video_placeholder = processor.tokenizer.decode(placeholder)
+                else:
+                    input_ids = video_outputs.pop("input_ids")
+                    input_ids[input_ids == processor.image_token_id] = (
+                        processor.video_token_id)
+                    video_placeholder = processor.tokenizer.batch_decode(
+                        input_ids)[0]
                 prompt = prompt.replace(
                     "<|begin_of_video|><|video|><|end_of_video|>",
                     video_placeholder,
@@ -1167,14 +1243,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
         hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
         image_processor = self.info.get_image_processor(
             **hf_processor_mm_kwargs)
-        tokenizer = self.info.get_tokenizer()
-        hf_config = self.info.get_hf_config()
-
-        boi_token_id = hf_config.image_start_token_id
-        eoi_token_id = hf_config.image_end_token_id
-
-        bov_token_id = hf_config.video_start_token_id
-        eov_token_id = hf_config.video_end_token_id
 
         merge_length = image_processor.merge_size**2
 
@@ -1192,21 +1260,8 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
             assert isinstance(grid_thw, torch.Tensor)
 
             video, metadata = mm_items["video"][item_idx]
-            timestamps = self.info._get_video_second_idx(metadata, len(video))
-            frames_idx_token = [
-                tokenizer.encode(str(i), add_special_tokens=False)
-                for i in timestamps
-            ]
-            num_tokens_per_frame = int(grid_thw[1:].prod()) // merge_length
-            placeholder = []
-            placeholder.append(bov_token_id)
-            for frame_idx in frames_idx_token:
-                placeholder.append(boi_token_id)
-                placeholder.extend([hf_processor.video_token_id] *
-                                   num_tokens_per_frame)
-                placeholder.append(eoi_token_id)
-                placeholder.extend(frame_idx)
-            placeholder.append(eov_token_id)
+            placeholder = self.info._construct_video_placeholder(
+                video, metadata, grid_thw)
             return PromptUpdateDetails.select_token_id(
                 placeholder,
                 embed_token_id=hf_processor.video_token_id,
@@ -1250,6 +1305,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.visual.": "visual.",
         })
 
+    supports_encoder_tp_data = True
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
@@ -1267,12 +1324,14 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.config = config
         self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
         self.visual = Glm4vVisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-5),
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
         )
 
         if config.model_type == "glm4v":
@@ -1377,40 +1436,49 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             self, image_input: Glm4vImageInputs) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
-
+            if self.use_data_parallel:
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values,
+                                                         grid_thw.tolist(),
+                                                         rope_type="rope_3d")
+            else:
+                image_embeds = self.visual(pixel_values,
+                                           grid_thw=grid_thw.tolist())
         merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
-        return image_embeds.split(sizes.tolist())
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
+        return image_embeds.split(sizes)
 
     def _process_video_input(
             self, video_input: Glm4vVideoInputs) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
-        device = self.visual.device
-        flat_grid_thw = torch.cat([
-            torch.tensor([[1, h, w]] * t, device=device)
-            for t, h, w in grid_thw
-        ])
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"].type(
                 self.visual.dtype)
-            video_embeds = self.visual(pixel_values_videos,
-                                       grid_thw=flat_grid_thw)
-
+            if self.use_data_parallel:
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values_videos,
+                                                         grid_thw.tolist(),
+                                                         rope_type="rope_3d")
+            else:
+                video_embeds = self.visual(pixel_values_videos,
+                                           grid_thw=grid_thw.tolist())
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
-
-        return video_embeds.split(sizes.tolist())
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
+        return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         mm_input_by_modality = {}
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 06ed453ec29f9325b399d686f4bb9ef388167127..1fb4576092892555c35b2335ea2cdfdf1c699968 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -159,7 +159,8 @@ class Glm4MoE(nn.Module):
             topk_group=config.topk_group,
             prefix=f"{prefix}.experts",
             scoring_func="sigmoid",
-            routed_scaling_factor=self.routed_scaling_factor,
+            # we do scaling outside, set factor to 1.0 to avoid double mul
+            routed_scaling_factor=1.0,
             e_score_correction_bias=self.gate.e_score_correction_bias,
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts)
@@ -183,6 +184,8 @@ class Glm4MoE(nn.Module):
 
         if self.n_shared_experts is not None:
             shared_output = self.shared_experts(hidden_states)
+        else:
+            shared_output = None
         router_logits = self.gate(hidden_states.to(dtype=torch.float32))
         final_hidden_states = self.experts(
             hidden_states=hidden_states,
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 4446b5ab181c1c99d390ea89a35d8fae14e38241..0f6521e44e6becc6b95a15d3894cbf2c74d9473d 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -339,7 +339,10 @@ class GPT2ForSequenceClassification(nn.Module):
         config = vllm_config.model_config.hf_config
         self.transformer = GPT2Model(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "gpt2"))
-        self.score = nn.Linear(config.n_embd, config.num_labels, bias=False)
+        self.score = nn.Linear(config.n_embd,
+                               config.num_labels,
+                               bias=False,
+                               dtype=vllm_config.model_config.head_dtype)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
@@ -348,7 +351,7 @@ class GPT2ForSequenceClassification(nn.Module):
             "encode":
             Pooler.for_encode(pooler_config),
             "classify":
-            Pooler.for_classify(pooler_config, classifier=None),
+            Pooler.for_classify(pooler_config, classifier=self.score),
         })
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -367,8 +370,7 @@ class GPT2ForSequenceClassification(nn.Module):
             position_ids=positions,
             inputs_embeds=inputs_embeds,
             intermediate_tensors=intermediate_tensors)
-        logits = self.score(hidden_states)
-        return logits
+        return hidden_states
 
 
 def _add_transformer_prefix(
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 306775af68065429eee51e3928612013f6f5b8d8..b42df3ad8650816edce0df38c844a4da6d5fd2b9 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -17,7 +17,7 @@ from transformers import PretrainedConfig
 
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargsItems
+from vllm.multimodal.inputs import MultiModalKwargsItems, MultiModalUUIDDict
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (MultiModalProcessingInfo,
@@ -479,7 +479,7 @@ class H2OVLMultiModalProcessor(
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -491,7 +491,7 @@ class H2OVLMultiModalProcessor(
                 mm_data_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         return super()._cached_apply_hf_processor(
@@ -499,7 +499,7 @@ class H2OVLMultiModalProcessor(
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
 
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index fbba849a76f23235fcdab6048d66e0bd83291b33..a74a44bc2b511e48c086ceb7f840dfff2a10d4cb 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -56,7 +56,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA
+from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_layers)
 
@@ -841,7 +841,7 @@ class HunYuanModel(nn.Module):
         return loaded_params
 
 
-class HunYuanV1Base(nn.Module, SupportsLoRA):
+class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 0ca2e9e4bb688d0d65895dfee78191a4f9a2b6c2..76737a4428232af991d3ac8b4de37b71db5243dd 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -31,7 +31,6 @@ from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
-                                               ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -139,37 +138,23 @@ class Idefics2VisionAttention(nn.Module):
         assert self.num_heads % tp_size == 0
         self.num_heads_per_partition = self.num_heads // tp_size
 
-        if use_data_parallel:
-            self.q_size = self.num_heads * self.head_dim
-            self.qkv_proj = ReplicatedLinear(
-                self.embed_dim,
-                3 * self.q_size,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.qkv_proj",
-            )
-            self.out_proj = ReplicatedLinear(
-                self.embed_dim,
-                self.embed_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.out_proj",
-            )
-        else:
-            self.qkv_proj = QKVParallelLinear(
-                self.embed_dim,
-                self.head_dim,
-                self.num_heads,
-                quant_config=quant_config,
-                prefix=f"{prefix}.qkv_proj",
-            )
-            self.out_proj = RowParallelLinear(
-                self.embed_dim,
-                self.embed_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.out_proj",
-            )
+        self.qkv_proj = QKVParallelLinear(
+            self.embed_dim,
+            self.head_dim,
+            self.num_heads,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.out_proj = RowParallelLinear(
+            self.embed_dim,
+            self.embed_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+            disable_tp=use_data_parallel,
+        )
+        # Use unified MultiHeadAttention with Flash Attention support
         self.attn = MultiHeadAttention(self.num_heads_per_partition,
                                        self.head_dim, self.scale)
 
@@ -181,6 +166,8 @@ class Idefics2VisionAttention(nn.Module):
             hidden_states
         )  # batch_size, q_len, 3 * num_heads_per_partition * head_dim
         query_states, key_states, value_states = qkv.chunk(3, dim=-1)
+
+        # Use unified MultiHeadAttention implementation
         out = self.attn(query_states, key_states, value_states)
         attn_output, _ = self.out_proj(out)
         return attn_output
@@ -198,23 +185,21 @@ class Idefics2VisionMLP(nn.Module):
         super().__init__()
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        cls_fc1 = (ReplicatedLinear
-                   if use_data_parallel else ColumnParallelLinear)
-        self.fc1 = cls_fc1(
+        self.fc1 = ColumnParallelLinear(
             config.hidden_size,
             config.intermediate_size,
             bias=True,
             quant_config=quant_config,
             prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
         )
-        cls_fc2 = (ReplicatedLinear
-                   if use_data_parallel else RowParallelLinear)
-        self.fc2 = cls_fc2(
+        self.fc2 = RowParallelLinear(
             config.intermediate_size,
             config.hidden_size,
             bias=True,
             quant_config=quant_config,
             prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -386,30 +371,6 @@ class Idefics2VisionTransformer(nn.Module):
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
 
-    def _consolidate_qkv_weights(
-        self, weights: Iterable[tuple[str, torch.Tensor]]
-    ) -> Iterable[tuple[str, torch.Tensor]]:
-        qkv_idx_mappings = {
-            ".self_attn.q_proj": 0,
-            ".self_attn.k_proj": 1,
-            ".self_attn.v_proj": 2,
-        }
-        qkv_weights = {}
-        for name, loaded_weight in weights:
-            for weight_name, idx in qkv_idx_mappings.items():
-                if weight_name not in name:
-                    continue
-                new_name = name.replace(weight_name, ".self_attn.qkv_proj")
-                if new_name not in qkv_weights:
-                    qkv_weights[new_name] = [None] * 3
-                qkv_weights[new_name][idx] = loaded_weight
-                break
-            else:
-                yield name, loaded_weight
-        for key, weight in qkv_weights.items():
-            qkv_weight = torch.cat(weight, dim=0)
-            yield key, qkv_weight
-
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
@@ -422,9 +383,6 @@ class Idefics2VisionTransformer(nn.Module):
         loaded_params: set[str] = set()
         layer_count = len(self.encoder.layers)
 
-        if self.use_data_parallel:
-            weights = self._consolidate_qkv_weights(weights)
-
         for name, loaded_weight in weights:
             # skip pooling header
             if name.startswith("head."):
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 2ee966fb5c0c8bd837b06ca30a5af486ab8065bc..d5b71b057831bb58f5542c7f10607ee4c955f8be 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -700,8 +700,10 @@ class SupportsTranscription(Protocol):
     def get_generation_prompt(cls, audio: np.ndarray,
                               stt_config: SpeechToTextConfig,
                               model_config: ModelConfig,
-                              language: Optional[str], task_type: str,
-                              request_prompt: str) -> PromptType:
+                              language: Optional[str],
+                              task_type: Literal["transcribe", "translate"],
+                              request_prompt: str,
+                              to_language: Optional[str]) -> PromptType:
         """Get the prompt for the ASR model.
         The model has control over the construction, as long as it
         returns a valid PromptType."""
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 58e8163e0b26e409c877194ab4df3d21c2cce0ff..8e9ab9649bd4476fe90d00cdcca0c115296adc6e 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -255,6 +255,10 @@ class InternSdpaAttention(nn.Module):
 
         self.proj = nn.Linear(self.dummy_dim, self.embed_dim)
 
+        # Use unified MultiHeadAttention with automatic backend selection
+        self.attn = MultiHeadAttention(self.num_heads, self.head_dim,
+                                       self.scale)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
         qkv = self.qkv(x)
@@ -268,12 +272,9 @@ class InternSdpaAttention(nn.Module):
             B_, N_, H_, D_ = q.shape
             q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_)
             k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
 
-        x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
-        x = x.transpose(1, 2).reshape(B, N, -1)
+        # Use unified MultiHeadAttention with automatic backend selection
+        x = self.attn(q, k, v)
 
         x = self.proj(x)
         return x
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 320e8d9d480c3ffe83d02f025a717cfa01f78e1e..ce94328797ed6b0ec2296a159169c72daad2895a 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -423,13 +423,15 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
             delattr(self, attr)
 
         config = vllm_config.model_config.hf_config
-        self.v_head = RowParallelLinear(
-            config.hidden_size,
-            1,
-            bias=False,
-            input_is_parallel=False,
-            prefix=maybe_prefix(prefix, "v_head"),
-        )
+        self.head_dtype = vllm_config.model_config.head_dtype
+
+        self.v_head = RowParallelLinear(config.hidden_size,
+                                        1,
+                                        bias=False,
+                                        input_is_parallel=False,
+                                        params_dtype=self.head_dtype,
+                                        prefix=maybe_prefix(prefix, "v_head"),
+                                        return_bias=False)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
@@ -446,5 +448,6 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
-        logits, _ = self.v_head(hidden_states)
+        hidden_states = hidden_states.to(self.head_dtype)
+        logits = self.v_head(hidden_states)
         return logits
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index c739e74b058fa59bd57438e002f049ad67bed5de..d998b8a0ab4f78cb1063f079885471ae767133dc 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -7,7 +7,7 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union
 
 import regex as re
 import torch
@@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
@@ -62,51 +63,60 @@ class InternS1MultiModalProjector(nn.Module):
         return hidden_states
 
 
-class InternS1ImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
+class InternS1ImagePixelInputs(TensorSchema):
     """
-    Shape:
-    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
+    Dimensions:
+        - bnp: Batch size * number of images * (1 + num_patches)
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
+        - bn: Batch size * number of images
     """
+    type: Literal["pixel_values"] = "pixel_values"
+    pixel_values: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")]
+    num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
 
-class InternS1ImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
+class InternS1ImageEmbeddingInputs(TensorSchema):
     """
-    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
-    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
+    Dimensions:
+        - ni: Number of images
+        - tifs: Total image feature size
+        - hs: Hidden size (must match language model backbone)
     """
+    type: Literal["image_embeds"] = "image_embeds"
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("ni", "tifs", "hs")]
 
 
 InternS1ImageInputs = Union[InternS1ImagePixelInputs,
                             InternS1ImageEmbeddingInputs]
 
 
-class InternS1VideoPixelInputs(TypedDict):
-    type: Literal["pixel_values_videos"]
-    pixel_values: torch.Tensor
+class InternS1VideoPixelInputs(TensorSchema):
     """
-    Shape:
-    `(batch_size * num_video * num_frames, num_channels, height, width)`
+    Dimensions:
+        - bnv: Batch size * number of videos * number of frames
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height
+        - w: Width
     """
-
-    num_patches: torch.Tensor
-    """Shape: `(batch_size * num_images)`"""
+    type: Literal["pixel_values_videos"] = "pixel_values_videos"
+    pixel_values: Annotated[torch.Tensor, TensorShape("bnv", 3, "h", "w")]
+    num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
 
-class InternS1VideoEmbeddingInputs(TypedDict):
-    type: Literal["video_embeds"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
+class InternS1VideoEmbeddingInputs(TensorSchema):
     """
-    A tensor of shape `(num_videos, total_video_feature_size, hidden_size)`
-    or a list of tensors of shape `(total_video_feature_size, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
+    Dimensions:
+        - nv: Number of videos
+        - tvfs: Total video feature size
+        - hs: Hidden size (must match language model backbone)
     """
+    type: Literal["video_embeds"] = "video_embeds"
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("nv", "tvfs", "hs")]
 
 
 InternS1VideoInputs = Union[InternS1VideoPixelInputs,
@@ -482,7 +492,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
-        # transformers InternVLProcessor uses <IMG_CONTEXT> as the seperator
+        # transformers InternVLProcessor uses <IMG_CONTEXT> as the separator
         # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
         if modality.startswith("image"):
             return '<IMG_CONTEXT>'
@@ -572,26 +582,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
         vit_embeds = self.multi_modal_projector(vit_embeds)
         return vit_embeds
 
-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-
-        h, w = self.config.vision_config.image_size
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = tuple(d.shape)
-
-            if actual_dims != expected_dims:
-                expected_expr = str(expected_dims)
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f" per patch is {expected_expr}. "
-                    f"You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[InternS1ImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
@@ -627,10 +617,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
             pixel_values = flatten_bn(pixel_values, concat=True)
             image_num_patches = flatten_bn(image_num_patches, concat=True)
 
+            h, w = self.config.vision_config.image_size
             return InternS1ImagePixelInputs(
                 type="pixel_values",
-                pixel_values=self._validate_pixel_values(pixel_values),
+                pixel_values=pixel_values,
                 num_patches=image_num_patches,
+                resolve_bindings={
+                    "h": h,
+                    "w": w,
+                },
             )
 
         raise AssertionError("This line should be unreachable.")
@@ -671,11 +666,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                                  concat=True)
             video_num_patches = flatten_bn(video_num_patches, concat=True)
 
+            h, w = self.config.vision_config.image_size
             return InternS1VideoPixelInputs(
                 type="pixel_values_videos",
-                pixel_values=self._validate_pixel_values(
-                    pixel_values_flat_video),
                 num_patches=video_num_patches,
+                pixel_values=pixel_values_flat_video,
+                resolve_bindings={
+                    "h": h,
+                    "w": w,
+                },
             )
 
         raise AssertionError("This line should be unreachable.")
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index 300ed17ecaabccd03e6f058f95e3e7ba85d0911a..eb6b685d03dc5c3f3e221b75aad6304a46e77cdf 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -12,10 +12,10 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from transformers import PretrainedConfig
 from transformers.utils import torch_int
 
+from vllm.attention.layer import MultiHeadAttention
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -206,6 +206,10 @@ class InternSdpaAttention(nn.Module):
 
         self.projection_layer = nn.Linear(self.dummy_dim, self.embed_dim)
 
+        # Use unified MultiHeadAttention with automatic backend selection
+        self.attn = MultiHeadAttention(self.num_heads, self.head_dim,
+                                       self.scale)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
 
@@ -213,20 +217,13 @@ class InternSdpaAttention(nn.Module):
         k = self.k_proj(x)
         v = self.v_proj(x)
 
-        q = q.view(B, N, self.num_heads, self.head_dim)
-        k = k.view(B, N, self.num_heads, self.head_dim)
-        v = v.view(B, N, self.num_heads, self.head_dim)
-
         if self.qk_normalization:
             B_, N_, H_, D_ = q.shape
             q = self.q_norm(q.flatten(-2, -1)).view(B_, N_, H_, D_)
             k = self.k_norm(k.flatten(-2, -1)).view(B_, N_, H_, D_)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
 
-        x = F.scaled_dot_product_attention(q, k, v, scale=self.scale)
-        x = x.transpose(1, 2).reshape(B, N, -1)
+        # Use unified MultiHeadAttention with automatic backend selection
+        x = self.attn(q, k, v)
 
         x = self.projection_layer(x)
         return x
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index b09ed7bbe72a39acdd7b638a6ccfe29b586b53ac..9565628b198e2a6b51450431f312c555394af1c6 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -7,6 +7,7 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Any, Literal, Optional, TypeVar, Union
@@ -37,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import set_default_torch_num_threads
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -115,13 +117,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs,
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose([
+    transform = T.Compose([
         T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
         T.Resize((input_size, input_size),
                  interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
         T.Normalize(mean=MEAN, std=STD)
     ])
+    # Image transformation operations (which include tensor computations
+    # on the CPU) can occupy a substantial number of CPU cores, introducing
+    # overhead due to CPU contention. This issue becomes particularly
+    # noticeable when deploying multiple vLLM instances on a single machine.
+    # Therefore, it is necessary to limit the number of threads allocated to
+    # image transformation tasks.
+    num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
+
+    def apply(img):
+        with set_default_torch_num_threads(num_threads):
+            return transform(img)
+
+    return apply
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index aebd2cbe2e99997ce71f7578e33278a8a2eb2a5f..550fde17b6c53b54635eb4ff39ec59387d24cf40 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -613,7 +613,7 @@ class JambaForSequenceClassification(JambaForCausalLM):
             config.hidden_size,
             num_labels,
             bias=score_bias,
-            dtype=torch.float32,
+            dtype=vllm_config.model_config.head_dtype,
         )
 
         pooler_config = vllm_config.model_config.pooler_config
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index 8c64f636c6a0f4935ae41434b31e287b78f2330d..f8c2a1e507a74cfcaa1b64c73336d3e285f317aa 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -5,9 +5,9 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs import TokensPrompt
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -28,13 +28,17 @@ logger = init_logger(__name__)
 
 class JinaVLScorer(nn.Module):
 
-    def __init__(self, config: PretrainedConfig):
+    def __init__(self, model_config: "ModelConfig"):
         super().__init__()
+        config = model_config.hf_config
+        head_dtype = model_config.head_dtype
         self.dense = ColumnParallelLinear(config.hidden_size,
                                           config.hidden_size,
+                                          params_dtype=head_dtype,
                                           bias=True)
         self.out_proj = RowParallelLinear(config.hidden_size,
                                           config.num_labels,
+                                          params_dtype=head_dtype,
                                           bias=True)
 
     def forward(self, x, **kwargs):
@@ -88,21 +92,17 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config,
                          prefix=maybe_prefix(prefix, "qwen2_vl"))
-        config = vllm_config.model_config.hf_config
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        # logit bias for sigmoid normalization
-        self.LOGIT_BIAS = 2.65
-
-        self.score = JinaVLScorer(config)
+        self.score = JinaVLScorer(vllm_config.model_config)
         self.pooler = DispatchPooler({
             "encode":
             Pooler.for_encode(pooler_config),
             "classify":
-            Pooler.for_classify(pooler_config, classifier=None),
+            Pooler.for_classify(pooler_config, classifier=self.score),
             "score":
-            Pooler.for_classify(pooler_config, classifier=None),
+            Pooler.for_classify(pooler_config, classifier=self.score),
         })
 
     @classmethod
@@ -137,9 +137,7 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
             inputs_embeds=inputs_embeds,
             **kwargs,
         )
-
-        logits = self.score(hidden_states) - self.LOGIT_BIAS
-        return logits
+        return hidden_states
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index c6dbd62b905e18933d0367ee72160a673935bd12..710b805acb3eaa564ae514a148134789d302ad92 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
+from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, Optional, TypeVar, Union
 
 import numpy as np
 import torch
@@ -57,16 +58,13 @@ from .vision import get_vit_attn_backend
 
 logger = init_logger(__name__)
 
-_MAX_FRAMES_PER_VIDEO = 16
-_MAX_IMAGE_SIZE = 9999999
-
 
 def smart_resize(
     height: int,
     width: int,
-    factor: int = 28,
-    min_pixels: int = 28 * 28 * 130,
-    max_pixels: int = 28 * 28 * 1280,
+    factor: int,
+    min_pixels: int,
+    max_pixels: int,
 ):
     if height < factor:
         logger.warning(
@@ -887,9 +885,9 @@ class Projector(nn.Module):
 
     def forward(
         self,
-        image_features: torch.Tensor,
+        image_features: Union[torch.Tensor, list[torch.Tensor]],
         image_grid_thw: list[tuple[int, int, int]],
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
         m1, m2 = self.merge_kernel_size
         if isinstance(image_features, (list, tuple)):
             processed_features = list()
@@ -986,6 +984,12 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
 class KeyeProcessingInfo(BaseProcessingInfo):
 
+    def get_max_image_size(self) -> int:
+        return 9999999  #_MAX_IMAGE_SIZE
+
+    def get_max_frame_per_video(self) -> int:
+        return 16  #_MAX_FRAMES_PER_VIDEO
+
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
@@ -1077,8 +1081,8 @@ class KeyeProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(self, ) -> ImageSize:
         max_image_size, _ = self._get_vision_info(
-            image_width=_MAX_IMAGE_SIZE,
-            image_height=_MAX_IMAGE_SIZE,
+            image_width=self.get_max_image_size(),
+            image_height=self.get_max_image_size(),
             image_processor=None,
         )
         return max_image_size
@@ -1123,7 +1127,7 @@ class KeyeProcessingInfo(BaseProcessingInfo):
                                                       max_image_tokens)
         max_frames_per_video = min(
             max_total_frames // max(max_videos, 1),
-            _MAX_FRAMES_PER_VIDEO,
+            self.get_max_frame_per_video(),
         )
 
         return max(max_frames_per_video, 1)
@@ -1139,7 +1143,10 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         )
 
 
-class KeyeDummyInputsBuilder(BaseDummyInputsBuilder[KeyeProcessingInfo]):
+_I = TypeVar("_I", bound=KeyeProcessingInfo)
+
+
+class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
@@ -1183,6 +1190,10 @@ class KeyeDummyInputsBuilder(BaseDummyInputsBuilder[KeyeProcessingInfo]):
         return mm_data
 
 
+class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]):
+    ...
+
+
 class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
 
     def _get_data_parser(self) -> MultiModalDataParser:
@@ -1231,13 +1242,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
         return _keye_field_config(hf_inputs)
 
 
-@MULTIMODAL_REGISTRY.register_processor(
-    KeyeMultiModalProcessor,
-    info=KeyeProcessingInfo,
-    dummy_inputs=KeyeDummyInputsBuilder,
-)
-class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
-                                   SupportsPP):
+class BaseKeyeModule(nn.Module):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -1264,6 +1269,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
 
         raise ValueError("Only image or video modality is supported")
 
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config: PretrainedConfig = vllm_config.model_config.hf_config
@@ -1278,7 +1288,8 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
             quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=maybe_prefix(prefix, "visual"),
         )
-        self.mlp_AR = Projector(
+
+        self.mlp_AR = self._build_projector(
             config,
             config.vision_config,
             quant_config=self._maybe_ignore_quant_config(quant_config),
@@ -1294,102 +1305,16 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
 
-    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
-        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
-            return None
-        return quant_config
-
-    def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors,
-                                        name: str) -> torch.Tensor:
-        if not isinstance(mm_input, (torch.Tensor, list)):
-            raise ValueError(f"Incorrect type of {name}. "
-                             f"Got type: {type(mm_input)}")
-        if isinstance(mm_input, torch.Tensor):
-            if mm_input.ndim == 2:
-                return mm_input
-            if mm_input.ndim == 5:
-                return mm_input
-            if mm_input.ndim != 3:
-                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
-                                 f"Got ndim: {mm_input.ndim} "
-                                 f"(shape={mm_input.shape})")
-            return torch.concat(list(mm_input))
-        elif is_list_of(mm_input, torch.Tensor):
-            if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2
-                                                          for p in mm_input):
-                return mm_input
-        return torch.concat(list(mm_input))
-
-    def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[KeyeImageInputs]:
-        pixel_values = kwargs.pop("pixel_values", None)
-        image_embeds = kwargs.pop("image_embeds", None)
-        image_grid_thw = kwargs.pop("image_grid_thw", None)
-
-        if pixel_values is None and image_embeds is None:
-            return None
-
-        if pixel_values is not None:
-            pixel_values = self._validate_and_reshape_mm_tensor(
-                pixel_values, "image pixel values")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            return KeyeImagePixelInputs(
-                type="pixel_values",
-                pixel_values=pixel_values,
-                image_grid_thw=image_grid_thw,
-            )
-
-        if image_embeds is not None:
-            image_embeds = self._validate_and_reshape_mm_tensor(
-                image_embeds, "image embeds")
-            image_grid_thw = self._validate_and_reshape_mm_tensor(
-                image_grid_thw, "image grid_thw")
-
-            return KeyeImageEmbeddingInputs(
-                type="image_embeds",
-                image_embeds=image_embeds,
-                image_grid_thw=image_grid_thw,
-            )
-
-    def _parse_and_validate_video_input(
-            self, **kwargs: object) -> Optional[KeyeVideoInputs]:
-        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
-        video_embeds = kwargs.pop("video_embeds", None)
-        video_grid_thw = kwargs.pop("video_grid_thw", None)
-
-        if pixel_values_videos is None and video_embeds is None:
-            return None
-
-        if pixel_values_videos is not None:
-            pixel_values_videos = self._validate_and_reshape_mm_tensor(
-                pixel_values_videos,
-                "video pixel values",
-            )
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
-            return KeyeVideoPixelInputs(
-                type="pixel_values_videos",
-                pixel_values_videos=pixel_values_videos,
-                video_grid_thw=video_grid_thw,
-            )
-
-        if video_embeds is not None:
-            video_embeds = self._validate_and_reshape_mm_tensor(
-                video_embeds, "video embeds")
-            video_grid_thw = self._validate_and_reshape_mm_tensor(
-                video_grid_thw, "video grid_thw")
-
-            return KeyeVideoEmbeddingInputs(
-                type="video_embeds",
-                video_embeds=video_embeds,
-                video_grid_thw=video_grid_thw,
-            )
+    @abstractmethod
+    def _build_projector(self,
+                         text_config: PretrainedConfig,
+                         vision_config: PretrainedConfig,
+                         quant_config: Optional[QuantizationConfig] = None,
+                         prefix: str = "") -> nn.Module:
+        raise ValueError("Need projector")
 
-    def _process_image_input(
-            self, image_input: KeyeImageInputs) -> tuple[torch.Tensor, ...]:
+    def _process_image_input(self,
+                             image_input: Any) -> tuple[torch.Tensor, ...]:
         siglip_position_ids = list()
         image_grid_hws = list()
         sample_indices = list()
@@ -1434,18 +1359,20 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
             image_embeds = tuple(self.mlp_AR(image_embeds, image_grid_thw))
             return image_embeds
 
-    def _process_video_input(
-            self, video_input: KeyeVideoInputs) -> tuple[torch.Tensor, ...]:
+    def _process_video_embeds(
+        self,
+        video_type: Literal["video_embeds", "pixel_values_videos"],
+        video_grid_thw: list[torch.Tensor],
+        pixel_values_videos: Optional[torch.Tensor] = None
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
         siglip_position_ids = list()
         video_grid_hws = list()
         sample_indices = list()
         cu_seqlens = [0]
 
-        video_grid_thw = video_input["video_grid_thw"]
         assert video_grid_thw.ndim == 2
-
-        for idx, thaw in enumerate(video_grid_thw):
-            thw_tuple = tuple(thaw.detach().cpu().numpy().tolist())
+        for idx, sub_thw in enumerate(video_grid_thw):
+            thw_tuple = tuple(sub_thw.detach().cpu().numpy().tolist())
             numel = np.prod(thw_tuple)
 
             video_grid_hws.append(thw_tuple)
@@ -1455,12 +1382,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
                                              dtype=torch.int64))
             cu_seqlens.append(cu_seqlens[-1] + numel)
 
-        if video_input["type"] == "video_embeds":
+        if video_type == "video_embeds":
             raise ValueError(
                 "Video embeddings are not supported for this processing path.")
         else:
-            pixel_values_videos = video_input["pixel_values_videos"].type(
-                self.visual.dtype)
+            pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
             siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
                 pixel_values_videos.device)
             cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
@@ -1479,7 +1405,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
                 use_rope=True,
                 window_size=-1,
             )
-            video_embeds = tuple(self.mlp_AR(video_embeds, video_grid_thw))
+            video_embeds = self.mlp_AR(video_embeds, video_grid_thw)
             return video_embeds
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
@@ -1541,8 +1467,8 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
     def get_input_embeddings_v0(
         self,
         input_ids: torch.Tensor,
-        image_input: Optional[KeyeImagePixelInputs] = None,
-        video_input: Optional[KeyeVideoPixelInputs] = None,
+        image_input: Optional[Any] = None,
+        video_input: Optional[Any] = None,
     ) -> torch.Tensor:
         inputs_embeds = self.get_input_embeddings(input_ids)
         if image_input is not None:
@@ -1572,7 +1498,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        """Run forward pass for Qwen2-VL.
+        """Run forward pass for Keye-VL.
 
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
@@ -1591,14 +1517,12 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
             video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
                 `None` if no videos are passed.
         """
-
         if intermediate_tensors is not None:
             inputs_embeds = None
 
         elif inputs_embeds is None:
             image_input = self._parse_and_validate_image_input(**kwargs)
             video_input = self._parse_and_validate_video_input(**kwargs)
-
             if image_input is None and video_input is None:
                 inputs_embeds = None
             else:
@@ -1619,6 +1543,7 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
             intermediate_tensors=intermediate_tensors,
             inputs_embeds=inputs_embeds,
         )
+
         return hidden_states
 
     def compute_logits(
@@ -1631,7 +1556,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
@@ -1639,6 +1563,122 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
         """Get the module prefix in multimodal models."""
         return MultiModelKeys.from_string_field(
             language_model="language_model",
-            connector="visual.",
-            tower_model="mlp_AR.",
+            connector="mlp_AR.",
+            tower_model="visual.",
         )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    KeyeMultiModalProcessor,
+    info=KeyeProcessingInfo,
+    dummy_inputs=KeyeDummyInputsBuilder,
+)
+class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
+                                   SupportsLoRA, SupportsPP):
+
+    def _build_projector(self,
+                         text_config: PretrainedConfig,
+                         vision_config: PretrainedConfig,
+                         quant_config: Optional[QuantizationConfig] = None,
+                         prefix: str = "") -> nn.Module:
+        return Projector(text_config, vision_config, quant_config, prefix)
+
+    def _validate_and_reshape_mm_tensor(
+            self, mm_input: NestedTensors,
+            name: str) -> Union[torch.Tensor, list[torch.Tensor]]:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input
+            if mm_input.ndim == 5:
+                return mm_input
+            if mm_input.ndim != 3:
+                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
+                                 f"Got ndim: {mm_input.ndim} "
+                                 f"(shape={mm_input.shape})")
+            return torch.concat(list(mm_input))
+        elif is_list_of(mm_input, torch.Tensor):
+            if all(p.dim() == 4 for p in mm_input) or all(p.dim() == 2
+                                                          for p in mm_input):
+                return mm_input
+        return torch.concat(list(mm_input))
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[KeyeImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, "image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            return KeyeImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
+
+        if image_embeds is not None:
+            image_embeds = self._validate_and_reshape_mm_tensor(
+                image_embeds, "image embeds")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            return KeyeImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw,
+            )
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[KeyeVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+
+        if pixel_values_videos is None and video_embeds is None:
+            return None
+
+        if pixel_values_videos is not None:
+            pixel_values_videos = self._validate_and_reshape_mm_tensor(
+                pixel_values_videos,
+                "video pixel values",
+            )
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            return KeyeVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+            )
+
+        if video_embeds is not None:
+            video_embeds = self._validate_and_reshape_mm_tensor(
+                video_embeds, "video embeds")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            return KeyeVideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw,
+            )
+
+    def _process_video_input(
+            self, video_input: KeyeVideoInputs) -> tuple[torch.Tensor, ...]:
+        video_type = video_input["type"]
+        video_grid_thw = video_input["video_grid_thw"]
+        pixel_values_videos = video_input.get("pixel_values_videos", None)
+
+        return tuple(
+            self._process_video_embeds(video_type, video_grid_thw,
+                                       pixel_values_videos))
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..605c6d3eaf643923dc5c175db494ee92f6d38657
--- /dev/null
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -0,0 +1,601 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+from collections.abc import Mapping, Sequence
+from functools import partial
+from typing import Annotated, Any, Literal, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from transformers import PretrainedConfig
+from transformers.activations import GELUActivation
+from transformers.feature_extraction_utils import BatchFeature
+
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
+from vllm.multimodal.inputs import (ImageItem, ModalityData,
+                                    MultiModalFieldConfig,
+                                    MultiModalKwargsItems, VideoItem)
+from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
+                                   MultiModalDataItems, MultiModalDataParser)
+from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
+                                        PromptUpdateDetails)
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
+from .keye import (BaseKeyeModule, BaseMultiModalProcessor,
+                   KeyeBaseDummyInputsBuilder, KeyeProcessingInfo)
+
+logger = init_logger(__name__)
+
+
+def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
+    """
+    Split grid_thw in t dimension.
+
+    Args:
+        grid_thw: [N, 3] tensor of [t, h, w]
+
+    Returns:
+        [Σt, 3] tensor where each row is [1, h, w]
+
+    Example:
+    >>> grid_thw = torch.tensor([[2, 3, 4], [1, 5, 6]])
+    >>> split_thw(grid_thw)
+    tensor([[1, 3, 4],
+           [1, 3, 4],
+           [1, 5, 6]])
+    """
+    t = grid_thw[:, 0]
+    h_w = grid_thw[:, 1:]
+    ones = torch.ones_like(h_w[:, :1])
+    return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)
+
+
+def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int],
+                                                              torch.Tensor]):
+    """
+    Return num_patches per video.
+
+    Args:
+        t: tensor with shape [N, ...] where each item is a list/tensor
+        cu_seqlens: list indicating the boundaries of groups
+
+    Returns:
+        list of ints representing the sum of products for each group
+
+    Examples:
+        >>> # Suppose there are 2 videos with a total of 3 grids
+        >>> grid_thw = torch.tensor([[2, 2, 2],  # grid 0: 2*2*2=8 patches
+        ...                          [2, 2, 2],  # grid 1: 2*2*2=8 patches
+        ...                          [1, 1, 1]]) # grid 2: 1*1*1=1 patches
+        >>> num_frames = [2, 1]  # The first video contains 2 grids,
+                                   the second contains 1 grid.
+        >>> get_num_patches(grid_thw, num_frames)
+        tensor([16, 1])  # Total patches for first video: 8+8=16,
+                           second video: 1.
+    """
+
+    assert len(grid_thw.shape) == 2
+    if isinstance(num_frames, torch.Tensor):
+        num_frames = num_frames.clone().tolist()
+
+    num_grids_per_frame = grid_thw.prod(dim=1)
+    start_idx_per_video = [0, *itertools.accumulate(num_frames)]
+    num_patches = [
+        num_grids_per_frame[start_idx_per_video[i]:start_idx_per_video[i + 1]].
+        sum() for i in range(len(num_frames))
+    ]
+    return torch.stack(num_patches) if num_patches else torch.zeros(
+        0, dtype=grid_thw.dtype, device=grid_thw.device)
+
+
+class KeyeVL1_5ImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - np: Number of patches
+        - c: Number of channels
+        - ps: Patch size
+        - ni: Number of images
+        - g: Grid dimensions (3 for t, h, w)
+    """
+    type: Literal["pixel_values"]
+
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})]
+
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
+
+
+class KeyeVL1_5ImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size (must match the hidden size of language model
+          backbone)
+        - ni: Number of images
+        - g: Grid dimensions (3 for t, h, w)
+    """
+    type: Literal["image_embeds"]
+    image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
+    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
+
+
+KeyeVL1_5ImageInputs = Union[KeyeVL1_5ImagePixelInputs,
+                             KeyeVL1_5ImageEmbeddingInputs]
+
+
+class KeyeVL1_5VideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - np: Number of patches
+        - c: Number of channels
+        - ps: Patch size
+        - ni: Number of images
+        - g: Grid dimensions (3 for t, h, w)
+    """
+    type: Literal["pixel_values_videos"]
+    pixel_values_videos: Annotated[
+        torch.Tensor,
+        TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
+
+    num_frames: torch.Tensor
+
+
+class KeyeVL1_5VideoEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of video features
+        - hs: Hidden size (must match the hidden size of language model
+          backbone)
+        - nv: Number of videos
+        - g: Grid dimensions (3 for t, h, w)
+    """
+    type: Literal["video_embeds"]
+    video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
+    num_frames: torch.Tensor
+
+
+KeyeVL1_5VideoInputs = Union[KeyeVL1_5VideoPixelInputs,
+                             KeyeVL1_5VideoEmbeddingInputs]
+
+
+class KeyeVL1_5Projector(nn.Module):
+
+    def __init__(
+        self,
+        text_config: PretrainedConfig,
+        vision_config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.merge_kernel_size = (2, 2)
+
+        self.hidden_size = (self.vision_config.hidden_size *
+                            self.merge_kernel_size[0] *
+                            self.merge_kernel_size[1])
+
+        self.pre_norm = torch.nn.LayerNorm(self.hidden_size, eps=1e-05)
+        self.act = GELUActivation()
+
+        self.linear_1 = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_1",
+        )
+        self.linear_2 = RowParallelLinear(
+            self.hidden_size,
+            self.text_config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear_2",
+        )
+
+    def forward(
+        self,
+        image_features: Union[torch.Tensor, tuple[torch.Tensor],
+                              list[torch.Tensor]],
+        image_grid_thw: list[tuple[int, int, int]],
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            processed_features = list()
+            for image_feature, image_grid in zip(image_features,
+                                                 image_grid_thw):
+                t, h, w = image_grid
+                image_feature = rearrange(
+                    image_feature,
+                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+                    t=t,
+                    h=h // m1,
+                    p1=m1,
+                    w=w // m2,
+                    p2=m2,
+                )
+                image_feature = self.pre_norm(image_feature)
+                hidden_states, _ = self.linear_1(image_feature)
+                hidden_states = self.act(hidden_states)
+                hidden_states, _ = self.linear_2(hidden_states)
+                processed_features.append(hidden_states)
+
+            return processed_features
+
+        dims = image_features.shape[:-1]
+        dim = image_features.shape[-1]
+        image_features = image_features.view(np.prod(dims), dim)
+        hidden_states = self.pre_norm(image_features.view(
+            -1, self.hidden_size))
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states.view(*dims, -1)
+
+
+class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo):
+
+    def get_max_frame_per_video(self) -> int:
+        return 2048
+
+    def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
+        return {"image": None, "video": 1}
+
+
+def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ):
+    image_grid_thw = hf_inputs.get("image_grid_thw",
+                                   torch.empty((0, 3), dtype=torch.int64))
+    image_grid_sizes = image_grid_thw.prod(-1)
+
+    video_grid_thw = hf_inputs.get("video_grid_thw",
+                                   torch.empty((0, 3), dtype=torch.int64))
+    video_grid_thw = split_thw(video_grid_thw)
+    num_frames = hf_inputs.get("num_frames",
+                               video_grid_thw[:, 0]).clone().tolist()
+
+    video_num_patches = get_num_patches(video_grid_thw, num_frames)
+
+    video_num_grids = []
+    if len(num_frames) > 0:
+        i = 0
+        j = 1
+        cur_frames = num_frames[i]
+        for t, _, _ in video_grid_thw.tolist():
+            cur_frames -= t
+            if cur_frames == 0:
+                video_num_grids.append(j)
+                i += 1
+                if i < len(num_frames):
+                    cur_frames = num_frames[i]
+                j = 1
+            else:
+                j += 1
+    video_num_grids = torch.tensor(video_num_grids)
+    return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes(
+        "image", image_grid_sizes),
+                image_embeds=MultiModalFieldConfig.flat_from_sizes(
+                    "image", image_grid_sizes),
+                image_grid_thw=MultiModalFieldConfig.batched("image"),
+                pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
+                    "video", video_num_patches),
+                video_embeds=MultiModalFieldConfig.flat_from_sizes(
+                    "video", video_num_patches),
+                video_grid_thw=MultiModalFieldConfig.flat_from_sizes(
+                    "video", video_num_grids),
+                num_frames=MultiModalFieldConfig.batched("video"))
+
+
+class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
+
+    def _parse_image_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return DictEmbeddingItems(
+                data,
+                modality="image",
+                required_fields={
+                    "image_embeds",
+                    "image_grid_thw",
+                },
+                fields_factory=_keye_field_config,
+            )
+
+        return super()._parse_image_data(data)
+
+    def _parse_video_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+    ) -> ModalityDataItems[Any, Any]:
+        if isinstance(data, dict):
+            return DictEmbeddingItems(
+                data,
+                modality="video",
+                required_fields={
+                    "video_embeds",
+                    "video_grid_thw",
+                },
+                fields_factory=_keye_field_config,
+            )
+
+        return super()._parse_video_data(data)
+
+
+class KeyeVL1_5MultiModalProcessor(
+        BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return KeyeVL1_5MultiModalDataParser()
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        image_processor = self.info.get_image_processor(
+            **hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+        image_token_id = vocab[hf_processor.image_token]
+        video_token_id = vocab[hf_processor.video_token]
+        placeholder = {"image": image_token_id, "video": video_token_id}
+        merge_length = image_processor.merge_size**2
+
+        out_mm_kwargs_data = out_mm_kwargs.get_data()
+        frame_types: list[torch.Tensor] = \
+            hf_processor_mm_kwargs.get("frame_types", None)
+        timestamps: list[torch.Tensor] = \
+            hf_processor_mm_kwargs.get("timestamps", None)
+        num_videos = mm_items.get_count("video", strict=False)
+
+        if frame_types is None:
+            frame_types = [None] * num_videos
+        assert len(frame_types) == num_videos, \
+            f"Number of frame_types={len(frame_types)} " \
+            f"doesn't equal to number of videos={num_videos}"
+        if timestamps is None:
+            timestamps = [None] * num_videos
+        assert len(timestamps) == num_videos, \
+            f"Number of timestamps={len(timestamps)} " \
+            f"doesn't equal to number of videos={num_videos}"
+
+        video_grid_thw = out_mm_kwargs_data.get(
+            'video_grid_thw', torch.empty((0, 3), dtype=torch.int64))
+        num_frames = out_mm_kwargs_data.get(
+            'num_frames', torch.tensor([], dtype=torch.int64))
+
+        assert len(num_frames) == num_videos, \
+            f"Size of num_frames={len(num_frames)} " \
+            f"doesn't equal to number of videos={num_videos}"
+
+        video_grid_hws = split_thw(video_grid_thw)
+        assert int(num_frames.sum().tolist()) == video_grid_hws.shape[0], (
+            f"The first dimension of `video_grid_hws`={video_grid_hws.shape[0]}"
+            f"doesn't equal to num of frames.")
+
+        cu_seqlens = torch.cumsum(torch.tensor([0] + num_frames.tolist()),
+                                  dim=-1)
+
+        def get_replacement_keye(item_idx: int, modality: str):
+            """
+            Args:
+                item_idx(int): The item index of modality to replace 
+                modality(str): The modality
+            """
+            if modality == "image":
+                out_item = out_mm_kwargs[modality][item_idx]
+                grid_thw = out_item[f"{modality}_grid_thw"].data
+                assert isinstance(grid_thw, torch.Tensor)
+
+                num_tokens = int(grid_thw.prod()) // merge_length
+                return [image_token_id] * num_tokens
+            elif modality == "video":
+                placeholders = []
+                video_timestamps = timestamps[item_idx]
+                video_frame_types = frame_types[item_idx]
+                grid_thw = video_grid_hws[
+                    cu_seqlens[item_idx]:cu_seqlens[item_idx + 1]]
+
+                nframes = grid_thw.shape[0]
+
+                if video_timestamps is None:
+                    video_timestamps = [""] * nframes
+                else:
+                    video_timestamps = [
+                        format(ts, ".1f") for ts in video_timestamps
+                    ]
+
+                if video_frame_types is None:
+                    video_frame_types = [0] * nframes
+                for i, sub_thw in enumerate(grid_thw):
+                    s = f"{hf_processor.frame_token}{video_timestamps[i]}"
+                    if video_frame_types[i] == 1:
+                        s += hf_processor.fast_start
+                    placeholders.extend(tokenizer.encode(s))
+                    num_frame_tokens = int(sub_thw.prod()) // merge_length
+                    placeholders.extend([video_token_id] * num_frame_tokens)
+                    if video_frame_types[i] == 1:
+                        placeholders.append(vocab[hf_processor.fast_end])
+
+                return PromptUpdateDetails.select_token_id(
+                    placeholders, embed_token_id=video_token_id)
+            else:
+                raise ValueError(f"Unsupported modality {modality}")
+
+        return [
+            PromptReplacement(
+                modality=modality,
+                target=[placeholder[modality]],
+                replacement=partial(get_replacement_keye, modality=modality),
+            ) for modality in ("image", "video")
+        ]
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return _keye_field_config(hf_inputs)
+
+
+class KeyeVL1_5DummyInputsBuilder(
+        KeyeBaseDummyInputsBuilder[KeyeVL1_5ProcessingInfo]):
+    ...
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    KeyeVL1_5MultiModalProcessor,
+    info=KeyeVL1_5ProcessingInfo,
+    dummy_inputs=KeyeVL1_5DummyInputsBuilder,
+)
+class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal,
+                                        SupportsLoRA, SupportsPP):
+
+    def _build_projector(self,
+                         text_config: PretrainedConfig,
+                         vision_config: PretrainedConfig,
+                         quant_config: Optional[QuantizationConfig] = None,
+                         prefix: str = "") -> nn.Module:
+        return KeyeVL1_5Projector(text_config, vision_config, quant_config,
+                                  prefix)
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config: PretrainedConfig = vllm_config.model_config.hf_config
+        self.merge_size = config.vision_config.spatial_merge_size
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+    def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors,
+                                        expected_dim: int, name: str):
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == expected_dim:
+                return mm_input
+            elif mm_input.ndim == expected_dim + 1:
+                return torch.concat(list(mm_input))
+            else:
+                raise ValueError(
+                    f"{name} should be {expected_dim}D or "
+                    f"batched {expected_dim}D tensor."
+                    f"Got ndim: {mm_input.ndim} (shape={mm_input.shape})")
+        else:
+            return torch.concat(list(mm_input))
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[KeyeVL1_5ImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, expected_dim=4, name="image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, expected_dim=2, name="image grid_thw")
+
+            return KeyeVL1_5ImagePixelInputs(
+                type="pixel_values",
+                pixel_values=pixel_values,
+                image_grid_thw=image_grid_thw,
+            )
+
+        if image_embeds is not None:
+            image_embeds = self._validate_and_reshape_mm_tensor(
+                image_embeds, expected_dim=2, name="image embeds")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, expected_dim=2, name="image grid_thw")
+
+            return KeyeVL1_5ImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw,
+            )
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[KeyeVL1_5VideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+        num_frames = kwargs.pop("num_frames", None)
+
+        if pixel_values_videos is None and video_embeds is None:
+            return None
+
+        if pixel_values_videos is not None:
+            pixel_values_videos = self._validate_and_reshape_mm_tensor(
+                pixel_values_videos,
+                expected_dim=4,
+                name="video pixel values",
+            )
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, expected_dim=2, name="video grid_thw")
+
+            num_frames = self._validate_and_reshape_mm_tensor(
+                num_frames, expected_dim=1, name="video num frames")
+
+            return KeyeVL1_5VideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+                num_frames=num_frames)
+
+        if video_embeds is not None:
+            video_embeds = self._validate_and_reshape_mm_tensor(
+                video_embeds, expected_dim=2, name="video embeds")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, expected_dim=2, name="video grid_thw")
+
+            return KeyeVL1_5VideoEmbeddingInputs(type="video_embeds",
+                                                 video_embeds=video_embeds,
+                                                 video_grid_thw=video_grid_thw,
+                                                 num_frames=num_frames)
+
+    def _process_video_input(
+            self,
+            video_input: KeyeVL1_5VideoInputs) -> tuple[torch.Tensor, ...]:
+        video_type = video_input["type"]
+        video_grid_thw = split_thw(video_input["video_grid_thw"])
+        pixel_values_videos = video_input.get("pixel_values_videos", None)
+
+        video_embeds = self._process_video_embeds(video_type, video_grid_thw,
+                                                  pixel_values_videos)
+        video_embeds = torch.concat(video_embeds, dim=0)
+
+        num_frames = video_input["num_frames"].clone().tolist()
+
+        num_patches = get_num_patches(video_grid_thw, num_frames).tolist()
+
+        patch_cu_seqlens = torch.cumsum(
+            torch.tensor([0] + num_patches).detach().clone(), dim=-1)
+        patch_cu_seqlens = torch.div(patch_cu_seqlens,
+                                     self.merge_size**2,
+                                     rounding_mode="floor")
+
+        new_video_embeds = []
+        for idx in range(patch_cu_seqlens.shape[0] - 1):
+            start = patch_cu_seqlens[idx]
+            end = patch_cu_seqlens[idx + 1]
+            new_video_embeds.append(video_embeds[start:end])
+        return tuple(new_video_embeds)
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index a08a9a62a57c5dab041283e9883163d800a3cf1a..4f76d4afdb20eb76cdd84f65addee856b8f04f51 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -56,6 +56,7 @@ from transformers.activations import GELUActivation
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
@@ -76,6 +77,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
 from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
@@ -93,8 +95,10 @@ class MaxImageTokenMeta:
 
 class KimiVLMultiModalProjector(nn.Module):
 
-    def __init__(self, config: KimiVLConfig):
+    def __init__(self, config: KimiVLConfig, \
+                 use_data_parallel: bool = False, prefix: str = ""):
         super().__init__()
+        self.use_data_parallel = use_data_parallel
 
         self.hidden_size = (config.vision_config.hidden_size *
                             config.vision_config.merge_kernel_size[0] *
@@ -102,20 +106,24 @@ class KimiVLMultiModalProjector(nn.Module):
 
         self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size,
                                            eps=1e-5)
-        self.linear_1 = nn.Linear(self.hidden_size,
-                                  self.hidden_size,
-                                  bias=True)
+        self.linear_1 = ReplicatedLinear(self.hidden_size,
+                                         self.hidden_size,
+                                         bias=True,
+                                         prefix=maybe_prefix(
+                                             prefix, "linear_1"))
+        self.linear_2 = ReplicatedLinear(self.hidden_size,
+                                         config.text_config.hidden_size,
+                                         bias=True,
+                                         prefix=maybe_prefix(
+                                             prefix, "linear_2"))
         self.act = GELUActivation()
-        self.linear_2 = nn.Linear(self.hidden_size,
-                                  config.text_config.hidden_size,
-                                  bias=True)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.pre_norm(image_features).view(
             -1, self.hidden_size)
-        hidden_states = self.linear_1(hidden_states)
+        hidden_states, _ = self.linear_1(hidden_states)
         hidden_states = self.act(hidden_states)
-        hidden_states = self.linear_2(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
         return hidden_states
 
 
@@ -273,6 +281,8 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
 class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                      SupportsPP):
 
+    supports_encoder_tp_data = True
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
@@ -292,10 +302,17 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
         quant_config = vllm_config.quant_config
 
         assert isinstance(config.vision_config, MoonViTConfig)
-
-        self.vision_tower = MoonVitPretrainedModel(config.vision_config)
-
-        self.multi_modal_projector = KimiVLMultiModalProjector(config=config)
+        self.use_data_parallel = model_config.multimodal_config.mm_encoder_tp_mode == "data"
+        self.hidden_size = config.text_config.hidden_size
+        self.vision_tower = MoonVitPretrainedModel(config.vision_config,
+                                                   self.use_data_parallel,
+                                                   prefix=maybe_prefix(
+                                                       prefix, "vision_tower"))
+
+        self.multi_modal_projector = KimiVLMultiModalProjector(
+            config=config,
+            use_data_parallel=self.use_data_parallel,
+            prefix=maybe_prefix(prefix, "multi_modal_projector"))
 
         self.quant_config = quant_config
         sub_vllm_config = copy.deepcopy(vllm_config)
@@ -376,13 +393,19 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         pixel_values = inputs["pixel_values"]
         image_grid_hws = inputs["image_grid_hws"]
-        return self.vision_tower(pixel_values, image_grid_hws)
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(self.vision_tower,
+                                                     pixel_values,
+                                                     image_grid_hws.tolist(),
+                                                     rope_type="rope_2d")
+        else:
+            return self.vision_tower(pixel_values, image_grid_hws)
 
     def _process_image_input(self,
                              image_input: KimiVLImageInputs) -> torch.Tensor:
         assert image_input["type"] == "pixel_values"
         image_features = self._process_image_pixels(image_input)
-        assert isinstance(image_features, list)
+        assert isinstance(image_features, (list, tuple))
         lengths = [x.shape[0] for x in image_features]
         return self.multi_modal_projector(
             torch.cat(image_features)).split(lengths)
@@ -496,6 +519,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
             expert_params_mapping = []
 
         params_dict = dict(self.named_parameters())
+
         for args in weights:
             name, loaded_weight = args[:2]
             kwargs = args[2] if len(args) > 2 else {}
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index a22bde194f5de1d93a269ca4c81505251b82eff7..f8ea2111fed578e422edb8b5e95410d5bb7244f8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -171,7 +171,22 @@ class LlamaAttention(nn.Module):
 
         sliding_window = None
         if layer_types := getattr(config, "layer_types", None):
-            is_sliding = layer_types[layer_idx] == "sliding_attention"
+            # Fix for Eagle3 compatibility:
+            # for draft models, subtract target layer count
+            # to get draft-relative layer index starting from 0
+            if hasattr(config, 'target_layer_count'):
+                # This is a draft model,
+                # adjust layer_idx to be relative to draft layers
+                effective_layer_idx = layer_idx - config.target_layer_count
+            else:
+                # This is a target model, use layer_idx directly
+                effective_layer_idx = layer_idx
+            assert effective_layer_idx < len(layer_types), \
+                f"effective_layer_idx: {effective_layer_idx} \
+                is out of bounds for layer_types: {layer_types}"
+
+            is_sliding = layer_types[
+                effective_layer_idx] == "sliding_attention"
             if is_sliding:
                 sliding_window = config.sliding_window
 
@@ -611,9 +626,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         loaded_weight: torch.Tensor,
     ) -> tuple[str, torch.Tensor]:
 
-        def permute(w: torch.Tensor, n_heads: int):
+        def permute(w: torch.Tensor, n_heads: int, attn_out: int):
             attn_in = self.config.head_dim * n_heads
-            attn_out = self.config.hidden_size
 
             return w.view(n_heads, attn_in // n_heads // 2, 2,
                           attn_out).transpose(1, 2).reshape(attn_in, attn_out)
@@ -622,12 +636,24 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         modules = name.split(".")
 
         # rotary embeds should be sliced
+        # If using quantized model in mistral format,
+        # quantization scales (qscale_weight) also need to be sliced
         if "wk" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
-                                    self.config.num_key_value_heads)
+                                    self.config.num_key_value_heads,
+                                    self.config.hidden_size)
+        elif "wk" in modules and modules[
+                -1] == "qscale_weight" and loaded_weight.numel() > 1:
+            loaded_weight = permute(loaded_weight,
+                                    self.config.num_key_value_heads, 1)
         elif "wq" in modules and modules[-1] == "weight":
             loaded_weight = permute(loaded_weight,
-                                    self.config.num_attention_heads)
+                                    self.config.num_attention_heads,
+                                    self.config.hidden_size)
+        elif "wq" in modules and modules[
+                -1] == "qscale_weight" and loaded_weight.numel() > 1:
+            loaded_weight = permute(loaded_weight,
+                                    self.config.num_attention_heads, 1)
 
         num_modules = len(modules)
         for i in range(num_modules):
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index ba08e6f81f7fe7397425a26a210ebee5539c42b1..ddd7e6a5936e3898d974e6a4e7a072eb7b27d730 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -36,6 +36,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 
@@ -73,7 +74,18 @@ class Llama4MoE(nn.Module):
                                        quant_config=None,
                                        prefix=f"{prefix}.router")
 
-        self.experts = FusedMoE(
+        self.shared_expert = LlamaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size_moe,
+            hidden_act="silu",
+            quant_config=quant_config,
+            bias=False,
+            prefix=f"{prefix}.shared_expert",
+            reduce_results=False,
+        )
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_expert,
             num_experts=config.num_local_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
@@ -83,22 +95,13 @@ class Llama4MoE(nn.Module):
             reduce_results=False,
             renormalize=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.experts")
-
-        self.shared_expert = LlamaMLP(
-            hidden_size=config.hidden_size,
-            intermediate_size=intermediate_size_moe,
-            hidden_act="silu",
-            quant_config=quant_config,
-            bias=False,
-            prefix=f"{prefix}.shared_expert",
-            reduce_results=self.experts.must_reduce_shared_expert_outputs(),
+            prefix=f"{prefix}.experts",
         )
 
     def forward(self, hidden_states):
         router_logits, _ = self.router(hidden_states)
-        shared_out = self.shared_expert(hidden_states)
-        routed_out = self.experts(
+
+        shared_out, routed_out = self.experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 572930c39a846756cf0ffaf64e06c6de5295bca4..bceb6cc42768e08612b457222cde52819118304a 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -199,6 +199,10 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
             speculative_config.draft_model_config.hf_config
         target_layer_num = vllm_config.model_config.get_num_layers(
             vllm_config.parallel_config)
+
+        # Store target layer count in draft config for
+        # proper layer_types indexing in draft models
+        self.config.target_layer_count = target_layer_num
         self.model = LlamaModel(vllm_config=vllm_config,
                                 prefix="model",
                                 start_layer_id=target_layer_num)
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8a847a6180f3a7694e87e5cff0968238d33e8e0f..d692b2783048fad7999162f04b9297b5f06ec94f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -24,7 +24,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems)
+                                    MultiModalInputs, MultiModalKwargsItems,
+                                    MultiModalUUIDDict)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    ImageSize, MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -795,7 +796,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -810,7 +811,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
                                mm_data,
                                hf_processor_mm_kwargs,
                                tokenization_kwargs,
-                               mm_hash_overrides=mm_hash_overrides)
+                               mm_uuids=mm_uuids)
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index e4ac0cd919101982b714a41984b54210a7208fbe..46d54452a52d829f13c8ad79854ea4f9de0c8ae2 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -216,12 +216,9 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
-        max_images = mm_counts.get("image", 0)
         max_videos = mm_counts.get("video", 0)
 
-        max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
 
@@ -838,7 +835,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
             return None
 
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..858d4e7e34cf1dd89a7c8696d9d552e3e8a00365
--- /dev/null
+++ b/vllm/model_executor/models/midashenglm.py
@@ -0,0 +1,788 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiDashengLM model compatible with HuggingFace weights."""
+import collections
+import collections.abc
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, Callable, Optional, TypedDict, Union, cast
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torchaudio.transforms as audio_transforms
+from transformers import BatchFeature
+
+from vllm.attention.layer import MultiHeadAttention
+from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargsItems)
+from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs.midashenglm import DashengConfig
+
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .utils import (AutoWeightsLoader, init_vllm_registered_model,
+                    maybe_prefix, merge_multimodal_embeddings)
+
+_Tuple2 = Union[int, tuple[int, int], Sequence[int]]
+
+
+def _resolve_tuple2(x: _Tuple2) -> tuple[int, int]:
+    if isinstance(x, collections.abc.Sequence):
+        assert len(x) == 2, (
+            f"Expected a sequence of length 2, got {x} with length {len(x)}")
+        return cast(tuple[int, int], tuple(x))
+    return (x, x)
+
+
+def calculate_mel_frames_dasheng(
+    audio_length_samples: int,
+    n_fft: int = 512,
+    hop_size: int = 160,
+    dasheng_subsampling: int = 4,
+    center=True,
+    model_subsampling: int = 5,
+) -> int:
+    """Calculate the number of Mel-spectrogram frames."""
+    if center:
+        audio_length_samples = audio_length_samples + n_fft
+
+    return (int(1 + ((audio_length_samples - n_fft) / hop_size)) //
+            dasheng_subsampling // model_subsampling)
+
+
+class AudioPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        input_size: _Tuple2 = 64,
+        patch_size: _Tuple2 = 16,
+        patch_stride: _Tuple2 = 16,
+        in_chans: int = 1,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten: bool = False,
+    ):
+        super().__init__()
+        self.input_size = _resolve_tuple2(input_size)
+        self.patch_size = _resolve_tuple2(patch_size)
+        self.patch_stride = _resolve_tuple2(patch_stride)
+        self.grid_size = (
+            self.input_size[0] // self.patch_stride[0],
+            self.input_size[1] // self.patch_stride[1],
+        )
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+
+        self.proj = nn.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_stride,
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        if self.flatten:
+            x = torch.permute(torch.flatten(
+                x, 2, 3), (0, 2, 1))  # rearrange(x, "b c f t -> b (f t) c")
+        x = self.norm(x)
+        return x
+
+
+class LayerScale(nn.Module):
+
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class DashengMlp(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = ColumnParallelLinear(input_size=in_features,
+                                        output_size=hidden_features,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.fc1")
+        self.act = get_act_fn("gelu")
+        self.fc2 = RowParallelLinear(input_size=hidden_features,
+                                     output_size=out_features,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.fc2")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class DashengAttention(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        causal: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.embed_dim = dim
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        if self.total_num_heads >= tp_size:
+            # Number of heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_heads % tp_size == 0
+        else:
+            # Number of heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_heads == 0
+        self.num_kv_heads = max(1, self.total_num_heads // tp_size)
+        self.head_dim = self.embed_dim // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.qkv = QKVParallelLinear(
+            hidden_size=self.embed_dim,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+        )
+        self.attn = MultiHeadAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+        )
+        self.proj = RowParallelLinear(
+            input_size=dim,
+            output_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.proj",
+        )
+        self.causal = causal
+
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
+        B, N, C = x.shape
+
+        qkv_out, _ = self.qkv(x)
+        q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size],
+                                dim=-1)
+
+        attn_out = self.attn(q, k, v)
+        C_local = attn_out.numel() // (B * N)  # C_local for parallel
+        attn_out = attn_out.view(B, N, C_local)
+
+        x, _ = self.proj(attn_out)
+
+        return x
+
+
+class DashengBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        init_values: Optional[float] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+        self.attn = DashengAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.ls1 = (LayerScale(dim, init_values=init_values)
+                    if init_values else nn.Identity())
+
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+        self.mlp = DashengMlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.ls2 = (LayerScale(dim, init_values=init_values)
+                    if init_values else nn.Identity())
+
+    # Kwargs usually has a mask parameter that is passed to Attention
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        x = x + self.ls1(self.attn(self.norm1(x), mask))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
+        return x
+
+
+class DashengAudioTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: DashengConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.target_length = config.target_length
+        self.hop_length = config.hop_length
+
+        self._init_front_end(config)
+
+        self.init_bn = nn.BatchNorm2d(config.n_mels, momentum=0.01)
+
+        self.patch_embed = AudioPatchEmbed(
+            input_size=(config.n_mels, config.target_length),
+            embed_dim=config.embed_dim,
+            in_chans=config.input_channels,
+            patch_size=config.patch_size,
+            flatten=False,
+            patch_stride=config.patch_stride,
+        )
+
+        self.time_pos_embed = nn.Parameter(
+            torch.empty(1, config.embed_dim, 1, self.patch_embed.grid_size[1]))
+        self.freq_pos_embed = nn.Parameter(
+            torch.empty(1, config.embed_dim, self.patch_embed.grid_size[0], 1))
+        self.blocks = nn.ModuleList(
+            DashengBlock(
+                dim=config.embed_dim,
+                num_heads=config.num_heads,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                init_values=config.init_values,
+                quant_config=quant_config,
+                prefix=f"{prefix}.block{i}",
+            ) for i in range(config.depth))
+        self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6)
+
+    def _init_front_end(self, config):
+        with set_default_torch_dtype(torch.float32):
+            self.front_end = nn.Sequential(
+                audio_transforms.MelSpectrogram(
+                    f_min=config.f_min,
+                    f_max=config.f_max,
+                    center=config.center,
+                    win_length=config.win_length,
+                    hop_length=config.hop_length,
+                    sample_rate=config.sample_rate,
+                    n_fft=config.n_fft,
+                    n_mels=config.n_mels,
+                ),
+                audio_transforms.AmplitudeToDB(top_db=120),
+            )
+
+            mel_spectrogram = self.front_end[0]
+            fb = mel_spectrogram.mel_scale.fb
+            win = mel_spectrogram.spectrogram.window
+            mel_spectrogram.mel_scale.fb = fb.to(torch.bfloat16).to(
+                torch.float32)
+            mel_spectrogram.spectrogram.window = win.to(torch.bfloat16).to(
+                torch.float32)
+
+    def forward_features(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        t = x.shape[-1]
+        x = x + self.time_pos_embed[:, :, :, :t]
+        x = (x + self.freq_pos_embed[:, :, :, :]
+             )  # Just to support __getitem__ in posembed
+        x = torch.permute(torch.flatten(x, 2, 3),
+                          (0, 2, 1))  # rearrange(x, "b c f t -> b (f t) c")
+        for block in self.blocks:
+            x = block(x, mask)
+        x = self.norm(x)
+        return x
+
+    def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
+        batch_size = len(lengths)
+        idx = torch.arange(max_length, device=lengths.device)
+        idx = idx.repeat(batch_size).view(batch_size, max_length)
+        mask = (idx < lengths.unsqueeze(-1)).bool()
+        return mask
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_length: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        x = self.front_end(x)
+        x = x.to(self.time_pos_embed.dtype)
+        target_length_in_patches = self.target_length // 4
+        x = x.unsqueeze(1)
+        x = torch.permute(x, (0, 2, 1, 3))
+        x = self.init_bn(x)
+        x = torch.permute(x, (0, 2, 1, 3))
+
+        x = self.patch_embed(x)
+        t = x.shape[-1]
+
+        input_splits = x.split(target_length_in_patches, dim=-1)
+
+        if x_length is not None:
+            assert len(x_length) == len(x), (
+                "batchsizes of input x and x_length need to be same")
+            assert x_length.ndim == 1, "Lengths are of size (B,)"
+            scaled_lengths = (x_length / (self.hop_length * 4)).long()
+            mask = self._to_mask(max_length=t, lengths=scaled_lengths)
+            split_masks = mask.logical_not().split(target_length_in_patches,
+                                                   dim=-1)
+        else:
+            mask = None
+            split_masks = [None] * len(input_splits)
+
+        outputs = []
+
+        for split_x, split_mask in zip(input_splits, split_masks):
+            forward_kwargs = {}
+            forward_kwargs["mask"] = split_mask
+            split_x = self.forward_features(split_x, **forward_kwargs)
+            outputs.append(split_x)
+        x = torch.cat(outputs, dim=1)
+        return x, mask
+
+
+class AudioProjectorSubsample(nn.Module):
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        downsample_rate=5,
+        dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.k = downsample_rate
+        self.net = nn.Sequential(
+            ColumnParallelLinear(
+                input_size=in_dim * self.k,
+                output_size=out_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.net.0",
+                return_bias=False,
+            ), get_act_fn("gelu"),
+            RowParallelLinear(
+                input_size=out_dim,
+                output_size=out_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.net.2",
+                return_bias=False,
+            ))
+
+    def forward(self, x, mask=None):
+        batch_size, seq_len, dim = x.shape
+        num_frames_to_discard = seq_len % self.k
+        if num_frames_to_discard > 0:
+            x = x[:, :-num_frames_to_discard, :]
+            if mask is not None:
+                mask = mask[:, :-num_frames_to_discard]
+        if mask is None:
+            mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
+        x = x.reshape(batch_size, -1, self.k *
+                      dim)  # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
+        for layer in self.net:
+            x = layer(x)
+        mask = mask.reshape(
+            batch_size, -1,
+            self.k)  # rearrange(mask, "b (s k) -> b s k", k=self.k)
+        mask = mask.any(dim=-1).long()
+        return x, mask
+
+
+# === Audio Inputs === #
+class MiDashengLMAudioInputs(TypedDict):
+    input_values: torch.Tensor
+    """Shape: `(num_audios, num_sampling_points)`"""
+    audio_length: torch.Tensor
+    """Shape: `(num_audios, 1)`"""
+
+
+class MiDashengLMProcessingInfo(BaseProcessingInfo):
+
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_feature_extractor(self):
+        hf_processor = self.get_hf_processor()
+        feature_extractor = hf_processor.feature_extractor
+        return feature_extractor
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"audio": None}
+
+    def get_min_audio_len(self):
+        return 3200
+
+    def get_max_audio_len(self):
+        return 160000
+
+
+class MiDashengLMDummyInputsBuilder(
+        BaseDummyInputsBuilder[MiDashengLMProcessingInfo]):
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_audios = mm_counts.get("audio", 0)
+
+        hf_processor = self.info.get_hf_processor()
+        audio_token = hf_processor.audio_token
+
+        return audio_token * num_audios
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        num_audios = mm_counts.get("audio", 0)
+
+        return {
+            "audio":
+            self._get_dummy_audios(length=self.info.get_max_audio_len(),
+                                   num_audios=num_audios)
+        }
+
+
+class MiDashengLMMultiModalProcessor(
+        BaseMultiModalProcessor[MiDashengLMProcessingInfo]):
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        feature_extractor = self.info.get_feature_extractor()
+        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, Any],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        audios = mm_data.pop("audios", [])
+
+        # + Padding
+        min_audio_len = self.info.get_min_audio_len()
+        processed_audios = [
+            np.pad(audio, (0, min_audio_len - audio.shape[-1]),
+                   mode='constant',
+                   constant_values=0) if isinstance(audio, np.ndarray)
+            and audio.shape[-1] < min_audio_len else audio for audio in audios
+        ]
+
+        if processed_audios:
+            mm_data["audio"] = processed_audios
+
+        if not mm_data.get("audio", []):
+            prompt_ids = self.info.get_tokenizer().encode(prompt)
+            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+        mm_kwargs = dict(**mm_kwargs, )
+
+        return super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            input_values=MultiModalFieldConfig.batched("audio"),
+            audio_length=MultiModalFieldConfig.batched("audio"),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        tokenizer = self.info.get_tokenizer()
+        vocab = tokenizer.get_vocab()
+
+        audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
+        audio_bos_token = getattr(processor, "audio_bos_token",
+                                  "<|audio_bos|>")
+        audio_eos_token = getattr(processor, "audio_eos_token",
+                                  "<|audio_eos|>")
+
+        audio_token_id = vocab[audio_token]
+        audio_bos_id = vocab[audio_bos_token]
+        audio_eos_id = vocab[audio_eos_token]
+
+        out_mm_data = out_mm_kwargs.get_data()
+        audio_length = out_mm_data.get("audio_length")
+        if audio_length is None:
+            audio_output_lengths = []
+        else:
+            audio_length_np = audio_length.cpu().numpy() if isinstance(
+                audio_length, torch.Tensor) else audio_length
+            audio_output_lengths = [
+                max(1, calculate_mel_frames_dasheng(
+                    int(length)))  # at least one frame
+                for length in audio_length_np
+            ]
+
+        def get_replacement_midashenglm(item_idx: int):
+            num_features = audio_output_lengths[item_idx]
+            audio_tokens = [audio_token_id] * num_features
+
+            return PromptUpdateDetails.select_token_id(
+                [audio_bos_id] + audio_tokens + [audio_eos_id],
+                embed_token_id=audio_token_id,
+            )
+
+        return [
+            PromptReplacement(
+                modality="audio",
+                target=audio_token,
+                replacement=get_replacement_midashenglm,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MiDashengLMMultiModalProcessor,
+    info=MiDashengLMProcessingInfo,
+    dummy_inputs=MiDashengLMDummyInputsBuilder,
+)
+class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        if modality.startswith("audio"):
+            return "<|audio_bos|><|AUDIO|><|audio_eos|>"
+
+        raise ValueError("Only audio modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+
+        # Initialize audio components
+        self.audio_encoder = DashengAudioTransformer(
+            config.audio_encoder_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "audio_encoder"),
+        )
+        self.audio_projector = AudioProjectorSubsample(
+            in_dim=config.audio_encoder_config.embed_dim,
+            out_dim=config.text_config.hidden_size,
+            downsample_rate=config.subsample_factor,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "audio_projector"),
+        )
+
+        # Initialize language model (decoder)
+        self.decoder = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "decoder"),
+            architectures=["Qwen2ForCausalLM"],
+        )
+
+        self.quant_config = quant_config
+        self.make_empty_intermediate_tensors = (
+            self.decoder.make_empty_intermediate_tensors)
+
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
+                                        name: str) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            return torch.concat(list(mm_input))
+        else:
+            return torch.concat(mm_input)
+
+    def _parse_and_validate_audio_input(
+            self, **kwargs: object) -> Optional[MiDashengLMAudioInputs]:
+        input_values = kwargs.pop("input_values", None)
+        audio_length = kwargs.pop("audio_length", None)
+
+        if input_values is None:
+            return None
+        input_values = self._validate_and_reshape_mm_tensor(
+            input_values, "input_values")
+        audio_length = self._validate_and_reshape_mm_tensor(
+            audio_length, "audio_length")
+        if not isinstance(input_values, (torch.Tensor, list)):
+            raise ValueError("Incorrect type of audio input features. "
+                             f"Got type: {type(input_values)}")
+
+        return MiDashengLMAudioInputs(
+            input_values=input_values,
+            audio_length=audio_length,
+        )
+
+    def _process_audio_input(
+            self, audio_input: MiDashengLMAudioInputs) -> torch.Tensor:
+        # Process audio through encoder and projector
+        input_values = audio_input["input_values"]
+        audio_length = audio_input["audio_length"]
+
+        encoder_out, encoder_atts = self.audio_encoder(input_values,
+                                                       audio_length)
+        audio_embeddings, _ = self.audio_projector(encoder_out, encoder_atts)
+        audio_embeddings = audio_embeddings.to(
+            audio_input["input_values"].dtype)
+        batch_size, max_audio_tokens, embed_dim = audio_embeddings.shape
+
+        audio_length_np = audio_length.cpu().numpy() if isinstance(
+            audio_length, torch.Tensor) else audio_length
+        audio_output_lengths = [
+            max(1, calculate_mel_frames_dasheng(
+                int(length)))  # at least one frame
+            for length in audio_length_np
+        ]
+        audio_output_lengths = torch.tensor(audio_output_lengths).to(
+            audio_embeddings.device)
+
+        audio_feature_mask = (torch.arange(
+            max_audio_tokens,
+            device=audio_embeddings.device).unsqueeze(0).expand(
+                batch_size, max_audio_tokens)
+                              < audio_output_lengths.unsqueeze(1))
+
+        masked_audio_features = audio_embeddings[audio_feature_mask].view(
+            -1, embed_dim)
+
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.tolist())
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.decoder
+
+    def get_multimodal_embeddings(self,
+                                  **kwargs: object) -> MultiModalEmbeddings:
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+
+        if audio_input is None:
+            return []
+        return self._process_audio_input(audio_input)
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.decoder.get_input_embeddings(input_ids)
+        if multimodal_embeddings and len(multimodal_embeddings) > 0:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                self.config.audio_token_id,
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+        elif inputs_embeds is None:
+            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      multimodal_embeddings)
+            input_ids = None
+
+        return self.decoder.model(input_ids,
+                                  positions,
+                                  intermediate_tensors,
+                                  inputs_embeds=inputs_embeds)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.decoder.compute_logits(hidden_states, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
deleted file mode 100644
index 692267b4d7271c353218e6227bd93e404a1d9918..0000000000000000000000000000000000000000
--- a/vllm/model_executor/models/mixtral_quant.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only Mixtral model."""
-from collections.abc import Iterable
-from itertools import islice
-from typing import Optional, Union
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-from torch import nn
-from transformers import MixtralConfig
-
-from vllm.attention import Attention
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import (
-    default_weight_loader, maybe_remap_kv_scale_name)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
-
-from .interfaces import SupportsPP
-from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
-
-
-class MixtralMLP(nn.Module):
-
-    def __init__(
-        self,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
-        super().__init__()
-        self.num_experts = num_experts
-        self.ffn_dim = intermediate_size
-        self.hidden_dim = hidden_size
-
-        self.w1 = ReplicatedLinear(self.hidden_dim,
-                                   self.ffn_dim,
-                                   bias=False,
-                                   quant_config=quant_config)
-        self.w2 = ReplicatedLinear(self.ffn_dim,
-                                   self.hidden_dim,
-                                   bias=False,
-                                   quant_config=quant_config)
-        self.w3 = ReplicatedLinear(self.hidden_dim,
-                                   self.ffn_dim,
-                                   bias=False,
-                                   quant_config=quant_config)
-
-        # TODO: Use vllm's SiluAndMul
-        self.act_fn = nn.SiLU()
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        w1_out, _ = self.w1(hidden_states)
-        w1_out = self.act_fn(w1_out)
-        w3_out, _ = self.w3(hidden_states)
-        current_hidden_states = w1_out * w3_out
-        current_hidden_states, _ = self.w2(current_hidden_states)
-        return current_hidden_states
-
-
-class MixtralMoE(nn.Module):
-
-    def __init__(
-        self,
-        config: MixtralConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
-        super().__init__()
-        self.config = config
-        self.rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.num_total_experts = config.num_local_experts
-        self.top_k = config.num_experts_per_tok
-        if self.tp_size > self.num_total_experts:
-            raise ValueError(
-                f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {self.num_total_experts}.")
-        # Split experts equally between ranks
-        self.expert_indices = np.array_split(range(self.num_total_experts),
-                                             self.tp_size)[self.rank].tolist()
-        if not self.expert_indices:
-            raise ValueError(
-                f"Rank {self.rank} has no experts assigned to it.")
-
-        self.experts = nn.ModuleList([
-            MixtralMLP(self.num_total_experts,
-                       config.hidden_size,
-                       config.intermediate_size,
-                       quant_config=quant_config)
-            if idx in self.expert_indices else None
-            for idx in range(self.num_total_experts)
-        ])
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     self.num_total_experts,
-                                     bias=False,
-                                     quant_config=None)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        num_tokens, hidden_dim = hidden_states.shape
-        hidden_states = hidden_states.view(-1, hidden_dim)
-        # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
-
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       self.top_k,
-                                                       dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-
-        final_hidden_states = None
-        for expert_idx in self.expert_indices:
-            expert_layer = self.experts[expert_idx]
-            expert_mask = (selected_experts == expert_idx)
-            expert_weights = (routing_weights * expert_mask).sum(dim=-1,
-                                                                 keepdim=True)
-
-            current_hidden_states = expert_layer(hidden_states).mul_(
-                expert_weights)
-            if final_hidden_states is None:
-                final_hidden_states = current_hidden_states
-            else:
-                final_hidden_states.add_(current_hidden_states)
-
-        return tensor_model_parallel_all_reduce(final_hidden_states).view(
-            num_tokens, hidden_dim)
-
-
-class MixtralAttention(nn.Module):
-
-    def __init__(
-        self,
-        config: MixtralConfig,
-        hidden_size: int,
-        num_heads: int,
-        num_kv_heads: int,
-        max_position: int = 4096 * 32,
-        rope_theta: float = 10000,
-        quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
-        self.total_num_heads = num_heads
-        assert self.total_num_heads % tp_size == 0
-        self.num_heads = self.total_num_heads // tp_size
-        self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= tp_size:
-            # Number of KV heads is greater than TP size, so we partition
-            # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % tp_size == 0
-        else:
-            # Number of KV heads is less than TP size, so we replicate
-            # the KV heads across multiple tensor parallel GPUs.
-            assert tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        # MixtralConfig has an optional head_dim argument
-        self.head_dim = getattr(config, "head_dim", None)
-        if self.head_dim is None:
-            self.head_dim = self.hidden_size // self.total_num_heads
-        self.q_size = self.num_heads * self.head_dim
-        self.kv_size = self.num_kv_heads * self.head_dim
-        self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
-
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size,
-            self.head_dim,
-            self.total_num_heads,
-            self.total_num_kv_heads,
-            bias=False,
-            quant_config=quant_config,
-        )
-        self.o_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            quant_config=quant_config,
-        )
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position,
-            base=int(self.rope_theta),
-            is_neox_style=True,
-        )
-        self.attn = Attention(self.num_heads,
-                              self.head_dim,
-                              self.scaling,
-                              num_kv_heads=self.num_kv_heads,
-                              cache_config=cache_config,
-                              quant_config=quant_config,
-                              prefix=f"{prefix}.attn")
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v)
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
-class MixtralDecoderLayer(nn.Module):
-
-    def __init__(
-        self,
-        config: MixtralConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        # Requires transformers > 4.32.0
-        rope_theta = getattr(config, "rope_theta", 10000)
-        self.self_attn = MixtralAttention(
-            config=config,
-            hidden_size=self.hidden_size,
-            num_heads=config.num_attention_heads,
-            max_position=config.max_position_embeddings,
-            num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.self_attn",
-        )
-        self.block_sparse_moe = MixtralMoE(config=config,
-                                           quant_config=quant_config)
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        positions: torch.Tensor,
-        hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
-        hidden_states = self.block_sparse_moe(hidden_states)
-        return hidden_states, residual
-
-
-class MixtralModel(nn.Module):
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = VocabParallelEmbedding(
-            config.vocab_size,
-            config.hidden_size,
-        )
-        self.start_layer, self.end_layer, self.layers = make_layers(
-            config.num_hidden_layers,
-            lambda prefix: MixtralDecoderLayer(
-                config, cache_config, quant_config=quant_config, prefix=prefix
-            ),
-            prefix=f"{prefix}.layers")
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = (
-            make_empty_intermediate_tensors_factory(
-                ["hidden_states", "residual"], config.hidden_size))
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        if get_pp_group().is_first_rank:
-            if inputs_embeds is not None:
-                hidden_states = inputs_embeds
-            else:
-                hidden_states = self.get_input_embeddings(input_ids)
-            residual = None
-        else:
-            assert intermediate_tensors is not None
-            hidden_states = intermediate_tensors["hidden_states"]
-            residual = intermediate_tensors["residual"]
-        for layer in islice(self.layers, self.start_layer, self.end_layer):
-            hidden_states, residual = layer(positions, hidden_states, residual)
-        if not get_pp_group().is_last_rank:
-            return IntermediateTensors({
-                "hidden_states": hidden_states,
-                "residual": residual
-            })
-        hidden_states, _ = self.norm(hidden_states, residual)
-        return hidden_states
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if name.endswith("scale"):
-                # Remapping the name of FP8 kv-scale.
-                name = maybe_remap_kv_scale_name(name, params_dict)
-                if name is None:
-                    continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # Skip experts that are not assigned to this worker.
-                if ("block_sparse_moe.experts." in name
-                        and name not in params_dict):
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
-
-
-class MixtralForCausalLM(nn.Module, SupportsPP):
-    fall_back_to_pt_during_load = False
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = MixtralModel(vllm_config=vllm_config,
-                                  prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index f441287a4d08965e61515ef8860771200cebaf74..048894085b360975e318bac8142996349f138587 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -35,6 +35,7 @@ from transformers.models.mllama.processing_mllama import (
 
 import vllm.distributed.parallel_state as ps
 from vllm.attention import Attention, AttentionMetadata, AttentionType
+from vllm.attention.layer import MultiHeadAttention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.attention.selector import _Backend
 from vllm.config import VllmConfig
@@ -57,7 +58,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                     MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+                                    MultiModalKwargsItems, MultiModalUUIDDict)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseProcessingInfo,
@@ -184,13 +185,13 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalEncDecInputs:
         mm_inputs = super().apply(prompt,
                                   mm_data,
                                   hf_processor_mm_kwargs,
                                   tokenization_kwargs,
-                                  mm_hash_overrides=mm_hash_overrides)
+                                  mm_uuids=mm_uuids)
 
         image_token_id = self.info.get_hf_config().image_token_index
         # Check that the number of image tokens in the decoder prompt matches
@@ -517,6 +518,10 @@ class MllamaVisionSdpaAttention(nn.Module):
             prefix=f"{prefix}.o_proj",
         )
 
+        # Use unified MultiHeadAttention with automatic backend selection
+        self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim,
+                                       1.0 / math.sqrt(self.head_dim))
+
     def forward(
         self,
         hidden_state: torch.Tensor,
@@ -524,21 +529,10 @@ class MllamaVisionSdpaAttention(nn.Module):
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_state)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = q.view(q.shape[0], q.shape[1], self.num_local_heads,
-                   self.head_dim).transpose(1, 2)
-        k = k.view(k.shape[0], k.shape[1], self.num_local_heads,
-                   self.head_dim).transpose(1, 2)
-        v = v.view(v.shape[0], v.shape[1], self.num_local_heads,
-                   self.head_dim).transpose(1, 2)
-
-        # TODO: remove padding in image encoder
-        attn_output = F.scaled_dot_product_attention(q,
-                                                     k,
-                                                     v,
-                                                     attn_mask=attention_mask,
-                                                     dropout_p=0.0)
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        # Use unified MultiHeadAttention with automatic backend selection
+        attn_output = self.attn(q, k, v)
+
         attn_output = attn_output.reshape(attn_output.shape[0],
                                           attn_output.shape[1], -1)
         output, _ = self.o_proj(attn_output)
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 53204966554cfd34034449a8d9157d6fca606500..efc7a674c0bb7bbe0d9c52d9dc5bc1b5ebdca04b 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -106,22 +106,21 @@ class Llama4VisionMLP(nn.Module):
         use_data_parallel: bool = False,
     ):
         super().__init__()
-        cls_fc1 = (ReplicatedLinear
-                   if use_data_parallel else ColumnParallelLinear)
-        self.fc1 = cls_fc1(
+        self.fc1 = ColumnParallelLinear(
             input_size=input_size,
             output_size=intermediate_size,
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.fc1",
+            disable_tp=use_data_parallel,
         )
-        cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear
-        self.fc2 = cls_fc2(
+        self.fc2 = RowParallelLinear(
             input_size=intermediate_size,
             output_size=output_size,
             bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.fc2",
+            disable_tp=use_data_parallel,
         )
         self.activation_fn = nn.GELU()
         self.output_activation = output_activation
@@ -419,20 +418,15 @@ class Llama4UnfoldConvolution(nn.Module):
             kernel_size = (kernel_size, kernel_size)
         self.unfold = torch.nn.Unfold(kernel_size=kernel_size,
                                       stride=config.patch_size)
-        params = {
-            "input_size":
-            config.num_channels * kernel_size[0] * kernel_size[1],
-            "output_size": config.hidden_size,
-            "bias": False,
-            "quant_config": quant_config,
-            "prefix": f"{prefix}.linear",
-        }
-        if use_data_parallel:
-            cls = ReplicatedLinear
-        else:
-            cls = ColumnParallelLinear
-            params["gather_output"] = True
-        self.linear = cls(**params)
+        self.linear = ColumnParallelLinear(
+            input_size=config.num_channels * kernel_size[0] * kernel_size[1],
+            output_size=config.hidden_size,
+            bias=False,
+            gather_output=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.linear",
+            disable_tp=use_data_parallel,
+        )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.unfold(hidden_states)
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 776287589808a969599084203b91fe4eb61c66bd..1d5da3139de9276c214c9d3b1c292c97968788ca 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -306,7 +306,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.config = config
         self.model = ModernBertModel(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "modernbert"))
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.classifier = nn.Linear(config.hidden_size,
+                                    config.num_labels,
+                                    dtype=vllm_config.model_config.head_dtype)
         self.pooling = ModernBertPooler(config)
 
         pooler_config = vllm_config.model_config.pooler_config
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index b2fc7be1af224f9db193619809b8d39821e0e362..5d999a02b4e657a893e10db2a034932bf833685e 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -76,20 +76,22 @@ class MolmoImageInputs(TensorSchema):
     """
     Dimensions:
         - bn: Batch size * number of images
-        - nc: Number of crops
+        - nc: Number of crops (dynamic)
         - np: Number of patches
+        - tp: Token sequence positions
         - pd: Patch dimension
     """
     images: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                      TensorShape("bn", "nc", "np", "pd")]
+                      TensorShape("bn", "nc", "np", "pd", dynamic_dims={"nc"})]
+    # Number of crops may vary per batch and image, so pass it as a list.
 
     image_masks: Annotated[Optional[Union[torch.Tensor, list[torch.Tensor]]],
-                           TensorShape("bn", "nc", "np")]
+                           TensorShape("bn", "nc", "np", dynamic_dims={"nc"})]
 
-    feat_is_patch: Annotated[Union[torch.Tensor, list[torch.Tensor]],
-                             TensorShape("bn", "nc", "np")]
+    feat_is_patch: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bn", "nc", "tp", dynamic_dims={"nc"})]
     # A boolean mask indicating which image features correspond to patch tokens.
-
     num_crops: Annotated[torch.Tensor, TensorShape("bn")]
 
 
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index d0fdab13ef0c95921bb52c06d3c3eb569795477e..41a2c836b09f3912dbf6ca01a1f9700f45321104 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -42,7 +42,6 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-import math
 from collections.abc import Sequence
 from copy import deepcopy
 from functools import cached_property
@@ -55,6 +54,8 @@ from transformers.activations import ACT2FN, PytorchGELUTanh
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import is_flash_attn_2_available
 
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.models.utils import maybe_prefix
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 
 if is_flash_attn_2_available():
@@ -383,21 +384,30 @@ class MLP2(nn.Module):
         bias: whether to use bias in linear layer.
     """
 
-    def __init__(self, dims: list[int], activation, bias=True):
+    def __init__(self,
+                 dims: list[int],
+                 activation,
+                 bias=True,
+                 prefix: str = "",
+                 use_data_parallel: bool = False):
         super().__init__()
         assert len(dims) == 3
-        self.fc0 = nn.Linear(dims[0], dims[1], bias=bias)
-        self.fc1 = nn.Linear(dims[1], dims[2], bias=bias)
+        self.use_data_parallel = use_data_parallel
+        self.fc0 = ReplicatedLinear(dims[0],
+                                    dims[1],
+                                    bias=bias,
+                                    prefix=maybe_prefix(prefix, "fc0"))
+        self.fc1 = ReplicatedLinear(dims[1],
+                                    dims[2],
+                                    bias=bias,
+                                    prefix=maybe_prefix(prefix, "fc1"))
         self.activation = activation
-        for m in [self.fc0, self.fc1]:
-            nn.init.trunc_normal_(m.weight, std=math.sqrt(2 / m.in_features))
-            if m.bias is not None:
-                nn.init.zeros_(m.bias)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.fc0(x)
+        x, _ = self.fc0(x)
         x = self.activation(x)
-        return self.fc1(x)
+        x, _ = self.fc1(x)
+        return x
 
 
 class MoonVitEncoderLayer(nn.Module):
@@ -407,6 +417,8 @@ class MoonVitEncoderLayer(nn.Module):
         num_heads: int,
         hidden_dim: int,
         mlp_dim: int,
+        prefix: str = "",
+        use_data_parallel: bool = False,
         *,
         attn_implementation: str = "sdpa",
         activation=F.gelu,
@@ -423,9 +435,19 @@ class MoonVitEncoderLayer(nn.Module):
 
         self.norm0 = nn.LayerNorm(hidden_dim)
         self.norm1 = nn.LayerNorm(hidden_dim)
-        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation)
-        self.wqkv = nn.Linear(hidden_dim, hidden_dim * 3, bias=attn_bias)
-        self.wo = nn.Linear(hidden_dim, hidden_dim, bias=attn_bias)
+        self.use_data_parallel = use_data_parallel
+        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim],
+                        activation,
+                        prefix=f"{prefix}.mlp",
+                        use_data_parallel=use_data_parallel)
+        self.wqkv = ReplicatedLinear(hidden_dim,
+                                     hidden_dim * 3,
+                                     bias=attn_bias,
+                                     prefix=f"{prefix}.wqkv")
+        self.wo = ReplicatedLinear(hidden_dim,
+                                   hidden_dim,
+                                   bias=attn_bias,
+                                   prefix=f"{prefix}.wo")
 
     def attention_qkvpacked(
         self,
@@ -438,7 +460,7 @@ class MoonVitEncoderLayer(nn.Module):
             x (torch.Tensor): (batch_size, seqlen, hidden_dim)
             cu_seqlens (torch.Tensor):
         """
-        xqkv = self.wqkv(x)
+        xqkv, _ = self.wqkv(x)
 
         qkv_shape = xqkv.size()[:-1] + (
             3,
@@ -457,8 +479,7 @@ class MoonVitEncoderLayer(nn.Module):
                              xv,
                              q_cu_seqlens=cu_seqlens,
                              k_cu_seqlens=cu_seqlens)
-
-        attn_out = self.wo(attn_out)
+        attn_out, _ = self.wo(attn_out)
         return attn_out
 
     def forward(
@@ -494,13 +515,17 @@ class MoonVitEncoder(nn.Module):
         hidden_dim: int,
         num_layers: int,
         block_cfg: dict,
+        prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
         self.rope_2d = Rope2DPosEmb(
             block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512)
         self.blocks = nn.ModuleList(
-            [MoonVitEncoderLayer(**block_cfg) for _ in range(num_layers)])
+            [MoonVitEncoderLayer(use_data_parallel=use_data_parallel, \
+                                 prefix=f"{prefix}.blocks.{layer_idx}", \
+                                 **block_cfg) for layer_idx in range(num_layers)])
         self.final_layernorm = nn.LayerNorm(hidden_dim)
 
     def forward(self, hidden_states: torch.Tensor,
@@ -508,10 +533,9 @@ class MoonVitEncoder(nn.Module):
         rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(
             grid_hws=grid_hw)
 
-        lengths = torch.cat((
-            torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
-            grid_hw[:, 0] * grid_hw[:, 1],
-        ))
+        lengths = torch.cat(
+            (torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
+             (grid_hw[:, 0] * grid_hw[:, 1]).to(hidden_states.device)))
         cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
 
         for _, block in enumerate(self.blocks):
@@ -587,11 +611,19 @@ class MoonVitPretrainedModel(PreTrainedModel):
     _supports_flash_attn_2 = True
     _supports_sdpa = True
 
-    def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
+    def __init__(self,
+                 config: MoonViTConfig,
+                 use_data_parallel: bool = False,
+                 prefix: str = "",
+                 *inputs,
+                 **kwargs):
         super().__init__(config, *inputs, **kwargs)
         config = deepcopy(config)
+        self.use_data_parallel = use_data_parallel
         self.merge_kernel_size = config.merge_kernel_size
+        self.hidden_size = config.hidden_size
         self.patch_size = config.patch_size
+        self.vit_processing_type = "rope_2d"
         self.patch_embed = MoonVisionPatchEmbed(
             out_dim=config.hidden_size,
             patch_size=config.patch_size,
@@ -610,6 +642,7 @@ class MoonVitPretrainedModel(PreTrainedModel):
                 "attn_bias": True,
                 "attn_implementation": config._attn_implementation,
             },
+            prefix=f"{prefix}.encoder",
         )
 
     def forward(self, pixel_values: torch.Tensor,
diff --git a/vllm/model_executor/models/motif.py b/vllm/model_executor/models/motif.py
new file mode 100644
index 0000000000000000000000000000000000000000..153f36dcf1f5509c61bba8fa128029ad9bf75dd9
--- /dev/null
+++ b/vllm/model_executor/models/motif.py
@@ -0,0 +1,345 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/modeling_motif.py
+# Copyright (c) Alibaba Cloud.
+# LICENSE: https://huggingface.co/Motif-Technologies/Motif-2.6B/blob/main/LICENSE
+"""Inference-only Motif model compatible with HuggingFace weights."""
+import math
+from typing import Any, Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention, AttentionType
+from vllm.attention.selector import _Backend
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.models.llama import LlamaForCausalLM
+
+from .adapters import as_seq_cls_model
+from .interfaces import SupportsV0Only
+from .utils import extract_layer_index
+
+
+class MotifMLP(nn.Module):
+    """MLP for the language component of the Motif model, which contains a
+    MergedColumnParallelLinear merging 2 outputs via PolyNorm activation."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str = "poly_norm",
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "poly_norm":
+            raise NotImplementedError(f"Unsupported activation: {hidden_act}. "
+                                      "Only poly_norm is supported for now.")
+        self.act_fn = PolyNorm()
+        self.intermediate_size = intermediate_size
+        tp_size = get_tensor_model_parallel_world_size()
+        if hidden_act == "poly_norm" and tp_size > 1:
+            raise NotImplementedError(
+                "Tensor parallelism for poly_norm is not supported yet. "
+                "Support will be added in the future.")
+
+    def forward(self, x):
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(
+            x[..., :self.intermediate_size]) * x[..., self.intermediate_size:]
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MotifAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: Optional[CacheConfig] = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__()
+        layer_idx = extract_layer_index(prefix)
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        head_dim = getattr(config, "head_dim", None)
+        if head_dim is None:
+            head_dim = self.hidden_size // self.total_num_heads
+        self.head_dim = head_dim
+        # Phi models introduced a partial_rotary_factor parameter in the config
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
+                                             1)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        assert self.num_heads % 2 == 0, 'num_heads should be even'
+        assert self.num_kv_heads % 2 == 0, 'num_heads should be even'
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self._init_rotary_emb(config,
+                              rope_scaling=rope_scaling,
+                              quant_config=quant_config)
+        sliding_window = None
+
+        self.lambda_init = self.lambda_init_fn(layer_idx)
+        self.lambda_q1 = nn.Parameter(
+            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
+                                                                    std=0.1))
+        self.lambda_k1 = nn.Parameter(
+            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
+                                                                    std=0.1))
+        self.lambda_q2 = nn.Parameter(
+            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
+                                                                    std=0.1))
+        self.lambda_k2 = nn.Parameter(
+            torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
+                                                                    std=0.1))
+        self.subln = RMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
+
+        params = {
+            'differential_flash_attention_config': {
+                'lambda_init': self.lambda_init,
+                'lambda_q1': self.lambda_q1,
+                'lambda_k1': self.lambda_k1,
+                'lambda_q2': self.lambda_q2,
+                'lambda_k2': self.lambda_k2,
+                "subln": self.subln,
+            }
+        }
+
+        diff_attn_err_msg = (
+            'Set VLLM_ATTENTION_BACKEND="DIFFERENTIAL_FLASH_ATTN" '
+            'to enable Differential Flash Attention.')
+        try:
+            self.attn = Attention(
+                self.num_heads,
+                self.head_dim,
+                self.scaling,
+                num_kv_heads=self.num_kv_heads,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                per_layer_sliding_window=sliding_window,
+                attn_type=attn_type,
+                prefix=f"{prefix}.attn",
+                **params,
+            )
+        except TypeError as e:
+            raise ValueError(diff_attn_err_msg) from e
+        assert (self.attn.backend == _Backend.DIFFERENTIAL_FLASH_ATTN
+                ), diff_attn_err_msg
+
+    def lambda_init_fn(self, depth):
+        return 0.8 - 0.6 * math.exp(-0.3 * (depth - 1))
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _init_rotary_emb(self, config: PretrainedConfig,
+                         rope_scaling: Optional[dict[str, Any]],
+                         quant_config: Optional[QuantizationConfig]) -> None:
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "llama":
+            is_neox_style = False
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
+
+
+class MotifDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+                config, "original_max_position_embeddings", None):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "use_bias", False)
+        bias_o_proj = attention_bias
+        if hasattr(config, 'qkv_bias'):
+            attention_bias = config.qkv_bias
+
+        # By default, Motif uses causal attention as it is a decoder-only model.
+        # You can override the HF config with `is_causal=False` to enable
+        # bidirectional attention, which is used in some embedding models
+        # (e.g. parasail-ai/GritLM-7B-vllm)
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        self.self_attn = MotifAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            bias_o_proj=bias_o_proj,
+            cache_config=cache_config,
+            prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
+        )
+        self.mlp = MotifMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            bias=getattr(config, "use_bias", False),
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(positions=positions,
+                                       hidden_states=hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+# Motif model uses differential attention
+# Only supported in v0 (no chunked prefill support)
+class MotifForCausalLM(LlamaForCausalLM, SupportsV0Only):
+
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 layer_type: type[nn.Module] = MotifDecoderLayer):
+
+        # Prefix caching and chunked prefill is not supported for this model.
+        assert not vllm_config.cache_config.enable_prefix_caching, \
+            "Motif currently does not support prefix caching"
+        assert not vllm_config.scheduler_config.chunked_prefill_enabled, \
+            "Motif currently does not support chunked prefill"
+
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         layer_type=layer_type)
+
+
+MotifForSequenceClassification = as_seq_cls_model(MotifForCausalLM)
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..21765a483b8e0d81d9503de09d91d0e38cd6b430
--- /dev/null
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -0,0 +1,1395 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# --------------------------------------------------------
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/internvl.py
+# under Apache-2.0 License
+#     LICENSE is in root directory.
+# --------------------------------------------------------
+
+import copy
+import warnings
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
+
+import numpy.typing as npt
+import torch
+import torch.nn as nn
+import torchvision.transforms as T
+from PIL import Image
+from transformers import (AutoModel, BatchEncoding, BatchFeature,
+                          PretrainedConfig, TensorType)
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
+                                                   MultiModalEmbeddings,
+                                                   SupportsMultiModal)
+from vllm.model_executor.models.internvl import (calculate_internvl_targets,
+                                                 get_internvl_target_ratios)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.nemotron_h import NemotronHForCausalLM
+from vllm.model_executor.models.utils import (flatten_bn,
+                                              init_vllm_registered_model,
+                                              maybe_prefix,
+                                              merge_multimodal_embeddings)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, MultiModalKwargsItems,
+                                    NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+# Configure PIL to handle large images without warnings
+# This prevents DecompressionBombWarning for legitimate large images
+Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
+# Alternative: Set a specific higher limit
+# Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
+
+IMG_START = "<img>"
+IMG_END = "</img>"
+IMG_CONTEXT = "<image>"
+
+# Profiling
+MAX_FRAMES = 16
+
+
+class NanoNemotronVLImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values_flat: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
+    """
+
+    num_patches: torch.Tensor
+    """Shape: `(batch_size * num_images)`"""
+
+
+class NanoNemotronVLImageEmbeddinInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """ 
+    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+NanoNemotronVLImageInputs = Union[NanoNemotronVLImagePixelInputs,
+                                  NanoNemotronVLImageEmbeddinInputs]
+
+
+class NanoNemotronVLVideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - bvf: Batch size * number of videos * num_frames
+        - bn: Batch size * number of images
+        - c: Number of channels (3)
+        - h: Height of each video frame
+        - w: Width of each video frame
+    """
+    type: Literal["pixel_values_videos"]
+    pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")]
+    num_patches: Annotated[torch.Tensor, TensorShape("bn")]
+
+
+class NanoNemotronVLVideoEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - n: Number of videos
+        - f: Total video feature size
+        - h: Hidden size (must match the hidden size of language model backbone)
+    """
+    type: Literal["video_embeds"]
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("n", "f", "h")]
+
+
+NanoNemotronVLVideoInputs = Union[NanoNemotronVLVideoPixelInputs,
+                                  NanoNemotronVLVideoEmbeddingInputs]
+
+
+def input_conditioner(x, norm_mean, norm_std):
+    y = (x - norm_mean) / norm_std
+    return y
+
+
+def dynamic_preprocess(image,
+                       *,
+                       image_size=512,
+                       max_num_tiles=12,
+                       use_thumbnail=True,
+                       idx=0):
+    orig_width, orig_height = image.size
+
+    target_ratios = get_internvl_target_ratios(1, max_num_tiles)
+
+    blocks, target_width, target_height = calculate_internvl_targets(
+        orig_width=orig_width,
+        orig_height=orig_height,
+        target_ratios=target_ratios,
+        image_size=image_size,
+        use_thumbnail=False)
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+
+    processed_images = [
+        img.convert("RGB") if img.mode != "RGB" else img
+        for img in processed_images
+    ]
+    processed_images = [
+        T.Resize((image_size, image_size),
+                 interpolation=T.InterpolationMode.BICUBIC)(img)
+        for img in processed_images
+    ]
+    processed_images = [T.ToTensor()(img) for img in processed_images]
+    return processed_images
+
+
+def image_to_pixel_values(
+    image: Image.Image,
+    *,
+    input_size: int,
+    max_num: int,
+    use_thumbnail: bool,
+    idx: int,
+) -> torch.Tensor:
+    images = dynamic_preprocess(
+        image,
+        image_size=input_size,
+        max_num_tiles=max_num,
+        use_thumbnail=use_thumbnail,
+        idx=idx,
+    )
+
+    pixel_values = torch.stack(images)
+    return pixel_values
+
+
+def video_to_pixel_values(
+    video: npt.NDArray,
+    *,
+    input_size: int,
+    max_num_tiles: int = 1,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    # Convert each frame to a single resized tile tensor consistent
+    # with image path
+    frames_tensors: list[torch.Tensor] = []
+    for frame in video:
+        pil_frame = dynamic_preprocess(
+            Image.fromarray(frame, mode="RGB"),
+            image_size=input_size,
+            max_num_tiles=max_num_tiles,
+            use_thumbnail=use_thumbnail,
+            idx=0,
+        )
+        # dynamic_preprocess returns tensors already; take the single tile
+        assert len(pil_frame) >= 1
+        frames_tensors.append(pil_frame[0])
+
+    return torch.stack(frames_tensors)
+
+
+class BaseNanoNemotronVLProcessor(ABC):
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+
+    The code to insert image tokens is based on:
+    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
+    """
+
+    def __init__(self, config: PretrainedConfig, tokenizer: AnyTokenizer,
+                 *args, **kwargs) -> None:
+        super().__init__()
+
+        self.config = config
+        self.tokenizer = tokenizer
+
+        image_size: int = config.force_image_size
+        patch_size: int = config.patch_size
+
+        self.num_image_token = int(
+            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+        self.image_size = image_size
+        self.use_thumbnail: bool = config.use_thumbnail
+        self.norm_mean = torch.Tensor(config.norm_mean).reshape(1, 3, 1, 1)
+        self.norm_std = torch.Tensor(config.norm_std).reshape(1, 3, 1, 1)
+
+    @property
+    @abstractmethod
+    def image_token_id(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> PromptUpdateDetails[str]:
+        raise NotImplementedError
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        max_num_tiles: int,
+    ) -> int:
+        target_ratios = get_internvl_target_ratios(1, max_num_tiles)
+
+        num_patches, _, _ = calculate_internvl_targets(
+            orig_width=image_width,
+            orig_height=image_height,
+            target_ratios=target_ratios,
+            image_size=self.image_size,
+            use_thumbnail=self.use_thumbnail,
+        )
+
+        return num_patches * self.num_image_token
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        max_num_tiles: int,
+    ) -> list[torch.Tensor]:
+        return [
+            image_to_pixel_values(
+                image,
+                input_size=self.image_size,
+                max_num=max_num_tiles,
+                use_thumbnail=self.use_thumbnail,
+                idx=idx,
+            ) for idx, image in enumerate(images)
+        ]
+
+    def _preprocess_image(
+        self,
+        text: list[str],
+        images: list[Image.Image],
+        max_num_tiles: int,
+    ) -> tuple[list[str], dict[str, torch.Tensor]]:
+        if len(images) == 0:
+            image_inputs = {}
+        else:
+            pixel_values_lst = self._images_to_pixel_values_lst(
+                images, max_num_tiles)
+            image_inputs: dict[str, NestedTensors] = {
+                "pixel_values_flat":
+                input_conditioner(torch.cat(pixel_values_lst), self.norm_mean,
+                                  self.norm_std),
+                "image_num_patches":
+                torch.tensor([len(item) for item in pixel_values_lst]),
+            }
+
+            for pixel_values in pixel_values_lst:
+                num_patches = pixel_values.shape[0]
+                feature_size = num_patches * self.num_image_token
+                image_repl = self.get_image_repl(feature_size, num_patches)
+                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+        return text, image_inputs
+
+    def _make_batch_input(self,
+                          input_item: Optional[Union[Any, list[Any]]] = None):
+        if input_item is None:
+            input_item = []
+        if not isinstance(input_item, list):
+            input_item = [input_item]
+        return input_item
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        max_num_tiles: Optional[int] = None,
+    ) -> Mapping[str, NestedTensors]:
+        # Use default if not provided
+        if max_num_tiles is None:
+            max_num_tiles = 12
+
+        text, images = [self._make_batch_input(x) for x in (text, images)]
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            max_num_tiles=max_num_tiles,
+        )
+
+        text_inputs = self.tokenizer(text, add_special_tokens=False)
+
+        return {
+            **BatchEncoding(text_inputs, tensor_type=return_tensors),
+            **image_inputs,
+        }
+
+
+class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
+    """
+    HF Processor  with extended video processing logic.
+    Code for video processing is adapted from video example:
+    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *,
+        min_dynamic_patch: Optional[int] = None,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        video_token: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+        # add extra video token for video processing
+        self.video_token = video_token
+
+    @property
+    def supports_video(self) -> bool:
+        return self.video_token_id is not None
+
+    @property
+    def video_token_id(self) -> Optional[int]:
+        if self.video_token is None:
+            return None
+        return self.tokenizer.get_vocab().get(self.video_token, None)
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT)
+
+    def _videos_to_pixel_values_lst(
+        self,
+        videos: list[npt.NDArray],
+        max_num_tiles: int,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> list[torch.Tensor]:
+
+        return [
+            video_to_pixel_values(
+                video,
+                input_size=self.image_size,
+                max_num_tiles=max_num_tiles,
+                use_thumbnail=self.use_thumbnail,
+            ) for video in videos
+        ]
+
+    def _preprocess_video(
+        self,
+        text: list[str],
+        videos: list[npt.NDArray],
+        max_num_tiles: int,
+        dynamic_image_size: Optional[bool] = None,
+    ):
+        if len(videos) == 0 or not self.supports_video:
+            video_inputs = {}
+        else:
+            pixel_values_lst_video = self._videos_to_pixel_values_lst(
+                videos,
+                max_num_tiles=max_num_tiles,
+                dynamic_image_size=dynamic_image_size,
+            )
+
+            video_inputs: dict[str, NestedTensors] = {
+                "pixel_values_flat_video":
+                input_conditioner(torch.cat(pixel_values_lst_video),
+                                  self.norm_mean, self.norm_std),
+                "video_num_patches":
+                torch.tensor([len(item) for item in pixel_values_lst_video]),
+            }
+
+            for pixel_values in pixel_values_lst_video:
+                num_patches = pixel_values.shape[0]
+
+                video_repl = self.get_video_repl(self.num_image_token,
+                                                 num_patches, self.video_token)
+                text = [t.replace('<video>', video_repl.full, 1) for t in text]
+        return text, video_inputs
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        max_num_tiles: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> Mapping[str, NestedTensors]:
+        # Use default if not provided
+        if max_num_tiles is None:
+            max_num_tiles = 12
+
+        text, images, videos = [
+            self._make_batch_input(x) for x in (text, images, videos)
+        ]
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            max_num_tiles=max_num_tiles,
+        )
+
+        text, video_inputs = self._preprocess_video(
+            text=text,
+            videos=videos,
+            max_num_tiles=max_num_tiles,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text_inputs = self.tokenizer(text, add_special_tokens=False)
+
+        return BatchFeature({
+            **BatchEncoding(text_inputs, tensor_type=return_tensors),
+            **image_inputs,
+            **video_inputs,
+        })
+
+    def get_image_repl(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> PromptUpdateDetails[str]:
+        repl_features = IMG_CONTEXT * feature_size
+        repl_full = IMG_START + repl_features + IMG_END
+
+        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+
+    def get_video_repl(
+        self,
+        feature_size: int,
+        num_patches: Optional[int] = None,
+        video_context_token: str = IMG_CONTEXT,
+    ) -> PromptUpdateDetails[str]:
+        repl_features = video_context_token * self.num_image_token
+        repl_features_with_sep = IMG_START + repl_features + IMG_END
+        # num_patches is equal to num_frames
+        repl_full = ''.join([
+            f'Frame{i+1}: {repl_features_with_sep}' for i in range(num_patches)
+        ])
+
+        return PromptUpdateDetails.select_text(repl_full, video_context_token)
+
+
+class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
+    """Basic image-only ProcessingInfo for InternVL-style models."""
+
+    @abstractmethod
+    def get_hf_processor(
+        self,
+        **kwargs: object,
+    ) -> BaseNanoNemotronVLProcessor:
+        raise NotImplementedError
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        max_num_tiles: int,
+        processor: Optional[BaseNanoNemotronVLProcessor],
+    ) -> int:
+        if processor is None:
+            processor = self.get_hf_processor()
+
+        return processor.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+            max_num_tiles=max_num_tiles,
+        )
+
+    def get_image_size_with_most_features(self,
+                                          max_num_tiles: int) -> ImageSize:
+        processor = self.get_hf_processor()
+
+        base_size = processor.image_size
+        target_ratios = get_internvl_target_ratios(1, max_num_tiles)
+
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for wr, hr in target_ratios:
+            width, height = base_size * wr, base_size * hr
+
+            feat_size = self.get_num_image_tokens(
+                image_width=width,
+                image_height=height,
+                max_num_tiles=max_num_tiles,
+                processor=processor,
+            )
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        if largest_feature_size == 0 or largest_feature_pinpoint is None:
+            raise ValueError("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
+    def get_max_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        # Use default max_num_tiles for max tokens calculation
+        max_num_tiles = 12
+        target_width, target_height = self.get_image_size_with_most_features(
+            max_num_tiles)
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            max_num_tiles=max_num_tiles,
+            processor=processor,
+        )
+
+
+_I = TypeVar("_I", bound=BaseNanoNemotronVLProcessingInfo)
+
+
+class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
+    """ ProcessingInfo extended for video processing"""
+
+    @property
+    def supports_video(self):
+        return self.get_hf_processor().supports_video
+
+    def get_supported_mm_limits(self):
+        video_limit = {"video": None} if self.supports_video else {}
+        return {**super().get_supported_mm_limits(), **video_limit}
+
+    def get_video_token(self) -> Optional[str]:
+        return IMG_CONTEXT
+
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        max_images = mm_counts.get("image", 0)
+        max_videos = mm_counts.get("video", 0)
+
+        processor = self.get_hf_processor()  # we get the CustomProcessor here
+
+        max_image_tokens = self.get_max_image_tokens() * max_images
+        max_total_frames = (seq_len -
+                            max_image_tokens) // processor.num_image_token
+        max_frames_per_video = max_total_frames // max(max_videos, 1)
+
+        max_frames_per_video = min(max_frames_per_video, MAX_FRAMES)
+        return max(max_frames_per_video, 1)
+
+    def get_hf_processor(self, **kwargs: object) -> NanoNemotronVLProcessor:
+        return self.ctx.init_processor(
+            NanoNemotronVLProcessor,
+            config=self.get_hf_config(),
+            tokenizer=self.get_tokenizer(),
+            video_token=self.get_video_token(),
+            **kwargs,
+        )
+
+
+class NanoNemotronBaseVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
+    """Basic image-only MultiModalProcessor for InternVL-style models."""
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> Mapping[str, NestedTensors]:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        image_token_id = hf_processor.image_token_id
+
+        # Since there may be extra tokens in the feature placeholders,
+        # we need to pass the image token ID to the model to select the
+        # tokens to merge from the vision encoder outputs
+        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: Mapping[str, NestedTensors],
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        num_images = len(image_num_patches)
+
+        return dict(
+            pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_num_patches),
+            image_num_patches=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.shared("image", num_images),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        elif "image_embeds" in out_mm_kwargs:
+            # to compute num_patches (similar to Qwen2-VL)
+            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+        else:
+            image_num_patches = []
+
+        def get_replacement_custom(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                # Extract max_num_tiles from kwargs, default to 12
+                max_num_tiles = hf_processor_mm_kwargs.get("max_num_tiles", 12)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    max_num_tiles=max_num_tiles,
+                    processor=hf_processor,
+                )
+
+            num_patches = None
+            local_image_num_patches = image_num_patches
+            if isinstance(local_image_num_patches, torch.Tensor):
+                local_image_num_patches = local_image_num_patches.tolist()
+            if isinstance(
+                    local_image_num_patches,
+                (list, tuple)) and item_idx < len(local_image_num_patches):
+                num_patches = int(local_image_num_patches[item_idx])
+
+            return hf_processor.get_image_repl(feature_size, num_patches)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>",
+                replacement=get_replacement_custom,
+            )
+        ]
+
+
+class NanoNemotronVLMultiModalProcessor(
+        NanoNemotronBaseVLMultiModalProcessor[NanoNemotronVLProcessingInfo]):
+    """MultiModalProcessor extended for video support"""
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> Mapping[str, NestedTensors]:
+        processed_outputs = super()._call_hf_processor(prompt, mm_data,
+                                                       mm_kwargs, tok_kwargs)
+
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        if self.info.supports_video and (
+                video_token_id := hf_processor.video_token_id) is not None:
+            processed_outputs["video_token_id"] = torch.tensor(video_token_id)
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: Mapping[str, NestedTensors],
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_fields = super()._get_mm_fields_config(hf_inputs,
+                                                     hf_processor_mm_kwargs)
+        if self.info.supports_video:
+            video_num_patches = hf_inputs.get("video_num_patches",
+                                              torch.empty(0))
+            num_videos = len(video_num_patches)
+            video_fields = dict(
+                pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes(
+                    "video", video_num_patches),
+                video_num_patches=MultiModalFieldConfig.batched("video"),
+                video_token_id=MultiModalFieldConfig.shared(
+                    "video", num_videos))
+        else:
+            video_fields = {}
+
+        return image_fields | video_fields
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        prompt_repl = super()._get_prompt_updates(
+            mm_items=mm_items,
+            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            out_mm_kwargs=out_mm_kwargs,
+        )
+
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        out_mm_data = out_mm_kwargs.get_data()
+        if "video_num_patches" in out_mm_data:
+            video_num_patches = out_mm_data["video_num_patches"]
+            assert isinstance(video_num_patches, torch.Tensor)
+            video_num_patches = video_num_patches.tolist()
+        else:
+            video_num_patches = []
+
+        def get_video_replacement_internvl(item_idx: int):
+            feature_size = hf_processor.num_image_token
+            num_patches = video_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return hf_processor.get_video_repl(
+                feature_size,
+                num_patches,
+                video_context_token=hf_processor.video_token)
+
+        if self.info.supports_video:
+            prompt_repl = [
+                *prompt_repl,
+                PromptReplacement(
+                    modality="video",
+                    target="<video>",
+                    replacement=get_video_replacement_internvl,
+                )
+            ]
+
+        return prompt_repl
+
+
+class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+    """Basic image-only DummyInputsBuilder for InternVL-style models."""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        return "<image>" * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        # Use default max_num_tiles for dummy data generation
+        max_num_tiles = 12
+        target_width, target_height = (
+            self.info.get_image_size_with_most_features(max_num_tiles))
+        num_images = mm_counts.get("image", 0)
+
+        return {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+
+class NanoNemotronVLDummyInputsBuilder(
+        NanoNemotronVLDummyInputsBuilder[NanoNemotronVLProcessingInfo]):
+    """DummyInputsBuilder extended for video support"""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_videos = mm_counts.get("video", 0)
+
+        return super().get_dummy_text(mm_counts) + "<video>" * num_videos
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
+                                                mm_counts=mm_counts)
+        if self.info.supports_video:
+            config = self.info.get_hf_config()
+            image_size: int = config.force_image_size
+            target_num_frames = \
+                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+            num_videos = mm_counts.get("video", 0)
+            dummy_video = {
+                "video":
+                self._get_dummy_videos(width=image_size,
+                                       height=image_size,
+                                       num_frames=target_num_frames,
+                                       num_videos=num_videos)
+            }
+        else:
+            dummy_video = {}
+        return {**dummy_image, **dummy_video}
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    NanoNemotronVLMultiModalProcessor,
+    info=NanoNemotronVLProcessingInfo,
+    dummy_inputs=NanoNemotronVLDummyInputsBuilder,
+)
+class NemotronH_Nano_VL(nn.Module, HasInnerState, IsHybrid,
+                        SupportsMultiModal):
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        if modality.startswith("image"):
+            return "<image>"
+        if modality.startswith("video"):
+            return "<video>"
+        return None
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+
+        image_size = config.force_image_size
+        patch_size = config.patch_size
+        self.patch_size = patch_size
+        self.template = config.template
+        self.num_image_token = int(
+            (image_size // patch_size)**2 * (config.downsample_ratio**2))
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = config.ps_version
+        self.image_tag_type = config.image_tag_type
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+        )
+        self.vision_model = AutoModel.from_config(config.vision_config,
+                                                  trust_remote_code=True)
+        self.vision_model.model._initialize_weights = (
+            self.vision_model.model._init_weights)
+        # Move input normalization to processor to mirror original HF
+        # implementation where normalization is done in fp32
+        self.vision_model.radio_model.make_preprocessor_external()
+        self.vision_model = self.vision_model.to(
+            self.language_model.config.torch_dtype)
+
+        self.drop_vision_class_token = True
+
+        # Construct the vision projection.
+        vit_hidden_size = config.vit_hidden_size
+        vision_projection_hidden_size = config.projector_hidden_size
+        llm_hidden_size = config.text_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            RMSNorm(hidden_size=vit_hidden_size *
+                    int(1 / self.downsample_ratio)**2,
+                    eps=1e-5),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio)**2,
+                vision_projection_hidden_size,
+                bias=False,
+            ),
+            ReLUSquaredActivation(),
+            nn.Linear(vision_projection_hidden_size,
+                      llm_hidden_size,
+                      bias=False),
+        )
+        self.mlp1 = self.mlp1.to(self.language_model.config.torch_dtype)
+
+        self.img_context_token_id = None
+        self.video_context_token_id = None
+        self.config = config
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(
+            n,
+            w,
+            int(h * scale_factor),
+            int(c / scale_factor),
+        )
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale -->
+        # N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
+            warnings.warn(
+                "In ps_version 'v1', the height and width have not "
+                "been swapped back, which results in a transposed image.",
+                stacklevel=2,
+            )
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        vit_embeds = self.vision_model(pixel_values).features
+        vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
+        h = w = int(vit_embeds.shape[1]**0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds,
+                                        scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
+                                        vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[NanoNemotronVLImageInputs]:
+        pixel_values_flat = kwargs.pop("pixel_values_flat", None)
+        image_num_patches = kwargs.pop("image_num_patches", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+
+        if pixel_values_flat is None and image_embeds is None:
+            return None
+
+        if image_embeds is not None:
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+
+            return NanoNemotronVLImageEmbeddinInputs(
+                type="image_embeds",
+                data=flatten_bn(image_embeds),
+            )
+
+        image_token_id = kwargs["image_token_id"]
+        assert isinstance(image_token_id, torch.Tensor)
+        self.img_context_token_id = image_token_id.flatten().unique().item()
+
+        if pixel_values_flat is not None:
+            if not isinstance(pixel_values_flat, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values_flat)}")
+
+            if not isinstance(image_num_patches, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image_num_patches. "
+                                 f"Got type: {type(image_num_patches)}")
+
+            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
+            image_num_patches = flatten_bn(image_num_patches, concat=True)
+
+            return NanoNemotronVLImagePixelInputs(
+                type="pixel_values",
+                pixel_values_flat=pixel_values_flat,
+                num_patches=image_num_patches,
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _process_image_input(
+            self, image_input: NanoNemotronVLImageInputs) -> torch.Tensor:
+        if image_input["type"] == "image_embeds":
+            return image_input["data"]
+
+        assert self.vision_model is not None
+
+        image_embeds = self.extract_feature(image_input["pixel_values_flat"])
+        num_patches = image_input["num_patches"]
+
+        # Only one image in the current batch
+        if len(num_patches) == 1:
+            return (image_embeds.view(-1,
+                                      self.config.text_config.hidden_size), )
+
+        # NOTE: Image embeddings are split into separate tensors for each image
+        # by the size of each embedding.
+        feature_size = image_embeds.shape[1]
+        image_embeds = image_embeds.view(-1,
+                                         self.config.text_config.hidden_size)
+        image_feature_sizes = [
+            num_patches * feature_size for num_patches in num_patches
+        ]
+        return image_embeds.split(image_feature_sizes)
+
+    def _parse_and_validate_video_input(
+            self,
+            **kwargs: object) -> Optional[NanoNemotronVLVideoPixelInputs]:
+        pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
+        video_num_patches = kwargs.pop("video_num_patches", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+
+        if pixel_values_flat_video is None and video_embeds is None:
+            return None
+
+        if video_embeds is not None:
+            return NanoNemotronVLVideoEmbeddingInputs(
+                type="video_embeds",
+                data=flatten_bn(video_embeds),
+            )
+
+        video_token_id = kwargs["video_token_id"]
+        assert isinstance(video_token_id, torch.Tensor)
+        self.video_context_token_id = video_token_id.flatten().unique().item()
+
+        if pixel_values_flat_video is not None:
+            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values_flat_video)}")
+
+            if not isinstance(video_num_patches, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image_num_patches. "
+                                 f"Got type: {type(video_num_patches)}")
+
+            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
+                                                 concat=True)
+            video_num_patches = flatten_bn(video_num_patches, concat=True)
+            expected_h = expected_w = self.config.force_image_size
+            resolve_bindings = {"h": expected_h, "w": expected_w}
+
+            return NanoNemotronVLVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_flat=pixel_values_flat_video,
+                num_patches=video_num_patches,
+                resolve_bindings=resolve_bindings,
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values_flat",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_flat_video",
+                             ) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
+
+    def get_multimodal_embeddings(
+            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+        # Validate the multimodal input keyword arguments
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if modalities is None:
+            return []
+
+        # # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor correspoending to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_image_input(video_input)
+                multimodal_embeddings += video_embeddings
+
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+        if (multimodal_embeddings is not None
+                and len(multimodal_embeddings) != 0):
+            context_token_ids = [
+                token_id for token_id in (self.img_context_token_id,
+                                          self.video_context_token_id)
+                if token_id is not None
+            ]
+            assert len(context_token_ids) >= 1
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                context_token_ids,
+            )
+
+        return inputs_embeds
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            input_ids = None
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+        return hidden_states
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="mlp1",
+            tower_model="vision_model",
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.compute_logits(hidden_states,
+                                                  sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        def is_vision_model_weights(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("vision_model")
+
+        def is_adapter_weights(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("mlp1")
+
+        # Get references to parameters for direct loading
+        vision_model_dict = dict(self.vision_model.named_parameters())
+        vision_model_buffers = dict(self.vision_model.named_buffers())
+        adapter_dict = dict(self.mlp1.named_parameters())
+
+        def llm_weights_generator():
+            # Single pass over weights
+            for name, w in weights:
+                if is_vision_model_weights((name, w)):
+                    # Load vision encoder weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    if "input_conditioner" in trimmed_name:
+                        continue
+                    if trimmed_name in vision_model_buffers:
+                        param = vision_model_buffers[trimmed_name]
+                    else:
+                        param = vision_model_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                elif is_adapter_weights((name, w)):
+                    # Load vision-language adapter weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    param = adapter_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                else:
+                    # LLM weights: yield them to be loaded
+                    # by language_model.load_weights
+                    assert name.startswith("language_model")
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    yield (trimmed_name, w)
+
+        # Now we call the language model load with the generator
+        self.language_model.load_weights(llm_weights_generator())
+
+    def print_architecture(self,
+                           detailed: bool = True,
+                           save_to_file: str = None):
+        """
+        Print model architecture with parameter names, shapes, and sizes.
+
+        Args:
+            detailed: If True, show detailed parameter breakdown
+            save_to_file: If provided, save output to this file path
+        """
+        import sys
+        from io import StringIO
+
+        # Capture output if saving to file
+        original_stdout = sys.stdout
+        if save_to_file:
+            sys.stdout = StringIO()
+
+        try:
+            print("=" * 100)
+            print("NemotronH_Nano_VL Model Architecture")
+            print("=" * 100)
+
+            total_params = 0
+            param_groups = {
+                "language_model": [],
+                "vision_model": [],
+                "mlp1": [],
+                "other": [],
+            }
+
+            for name, param in self.named_parameters():
+                param_size = param.numel()
+                total_params += param_size
+
+                # Group parameters by main component
+                if name.startswith("language_model"):
+                    param_groups["language_model"].append(
+                        (name, param.shape, param_size, param.dtype))
+                elif name.startswith("vision_model"):
+                    param_groups["vision_model"].append(
+                        (name, param.shape, param_size, param.dtype))
+                elif name.startswith("mlp1"):
+                    param_groups["mlp1"].append(
+                        (name, param.shape, param_size, param.dtype))
+                else:
+                    param_groups["other"].append(
+                        (name, param.shape, param_size, param.dtype))
+
+                if detailed:
+                    print(f"{name:<70} | Shape: {str(param.shape):<25} | "
+                          f"Size: {param_size:>12,} | Dtype: {param.dtype}")
+
+            print("=" * 100)
+            print("Summary by Component:")
+            print("-" * 60)
+
+            for component, params in param_groups.items():
+                if params:  # Only show components that have parameters
+                    component_total = sum(size for _, _, size, _ in params)
+                    percentage = ((component_total / total_params) *
+                                  100 if total_params > 0 else 0)
+                    print(f"{component:<20} | Parameters: {len(params):>4} | "
+                          f"Total Size: {component_total:>15,} | "
+                          f"{percentage:>6.2f}%")
+
+            print("-" * 60)
+            print(f"{'Total Parameters':<20} | {total_params:>15,}")
+
+            # Estimate memory usage (assuming bfloat16 = 2 bytes per parameter)
+            memory_mb = total_params * 2 / (1024**2)
+            memory_gb = memory_mb / 1024
+            print(f"{'Est. Memory (MB)':<20} | {memory_mb:>15.2f}")
+            print(f"{'Est. Memory (GB)':<20} | {memory_gb:>15.2f}")
+            print("=" * 100)
+
+            # Save to file if requested
+            if save_to_file:
+                output = sys.stdout.getvalue()
+                sys.stdout = original_stdout
+                with open(save_to_file, "w") as f:
+                    f.write(output)
+                print(f"Architecture saved to: {save_to_file}")
+                print(output)  # Also print to console
+
+        finally:
+            if save_to_file and sys.stdout != original_stdout:
+                sys.stdout = original_stdout
+
+    def get_model_info(self):
+        """
+        Get basic model information as a dictionary.
+        """
+        total_params = sum(p.numel() for p in self.parameters())
+
+        component_info = {}
+        for name, param in self.named_parameters():
+            component = name.split(".")[0]
+            if component not in component_info:
+                component_info[component] = {"params": 0, "size": 0}
+            component_info[component]["params"] += 1
+            component_info[component]["size"] += param.numel()
+
+        return {
+            "model_name": "NemotronH_Nano_VL",
+            "total_parameters": total_params,
+            "memory_estimate_mb": total_params * 2 / (1024**2),  # bfloat16
+            "components": component_info,
+            "config": {
+                "image_size": getattr(self.config, "force_image_size", None),
+                "patch_size": getattr(self.config, "patch_size", None),
+                "num_image_token": self.num_image_token,
+                "downsample_ratio": self.downsample_ratio,
+            },
+        }
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.language_model.mamba_cache.copy_inputs_before_cuda_graphs(
+            input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return (self.language_model.mamba_cache.
+                get_seqlen_agnostic_capture_inputs(batch_size))
+
+    @classmethod
+    def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"):
+        text_config = vllm_config.model_config.hf_config.text_config
+        temp_vllm_config = copy.deepcopy(vllm_config)
+        temp_vllm_config.model_config.hf_config = text_config
+        return NemotronHForCausalLM.get_mamba_state_shape_from_config(
+            temp_vllm_config)
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"):
+        text_config = vllm_config.model_config.hf_config.text_config
+        temp_vllm_config = copy.deepcopy(vllm_config)
+        temp_vllm_config.model_config.hf_config = text_config
+        return NemotronHForCausalLM.get_mamba_state_dtype_from_config(
+            temp_vllm_config)
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 8a563288cb4d6d592505e25276fe7897ff5c5607..da8628df1fe579192691216dc64d882c365cd9c1 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -44,15 +44,16 @@ from vllm.model_executor.layers.mamba.mamba_utils import (
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
                                                    SupportsLoRA, SupportsPP,
                                                    SupportsQuant)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.models.utils import (
-    AutoWeightsLoader, make_empty_intermediate_tensors_factory, make_layers,
-    maybe_prefix)
+    AutoWeightsLoader, WeightsMapper, make_empty_intermediate_tensors_factory,
+    make_layers, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs import NemotronHConfig
@@ -426,38 +427,36 @@ class NemotronHModel(nn.Module):
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        attb_params_mapping = {
-            "q_proj": "q",
-            "k_proj": "k",
-            "v_proj": "v",
-        }
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
 
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            if "embeddings" in name:
-                name = name.replace("embeddings", "embed_tokens")
+            if "scale" in name:
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            # load stacked params
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
 
-            if "A_log" in name:
-                name = name.replace("A_log", "A")
-                loaded_weight = loaded_weight.to(torch.float32)
-
-            if "D" in name:
-                loaded_weight = loaded_weight.to(torch.float32)
-
-            if "dt_bias" in name:
-                loaded_weight = loaded_weight.to(torch.float32)
-
-            # load attn params
-            if any(proj in name for proj in ["q_proj", "k_proj", "v_proj"]):
-                weight_name = next(proj
-                                   for proj in ["q_proj", "k_proj", "v_proj"]
-                                   if proj in name)
-                name = name.replace(weight_name, "qkv_proj")
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight,
-                              attb_params_mapping[weight_name])
+                weight_loader(param, loaded_weight, shard_id)
+                break
+
             # load other params
             else:
                 param = params_dict[name]
@@ -471,6 +470,14 @@ class NemotronHModel(nn.Module):
 
 class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
                            IsHybrid, SupportsQuant):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={"backbone": "model"},
+        orig_to_new_substr={
+            "A_log": "A",
+            "embeddings": "embed_tokens"
+        },
+    )
+
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -622,10 +629,5 @@ class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        # update name in weights before passing to loader
-        updated_weights = []
-        for name, loaded_weight in weights:
-            name = name.replace("backbone", "model")
-            updated_weights.append((name, loaded_weight))
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(updated_weights)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 04a06e5f9d6004b82b2481fb6564f4013e0ca52e..f1bb18716b40d52550dba52b85481c80bb1844f6 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -19,7 +19,7 @@
 """ PyTorch Ovis model."""
 import math
 from collections.abc import Iterable, Mapping
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -49,6 +49,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.processors.ovis import OvisProcessor
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import merge_multimodal_embeddings
@@ -201,25 +202,22 @@ class VisualTokenizer(torch.nn.Module):
         return tokens
 
 
-class OvisImagePatchInputs(TypedDict):
-    type: Literal["image_patches"]
-    flat_data: torch.Tensor
-    """
-    Shape: 
-    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
-    """
-
-    indicator_tokens: torch.Tensor
+class OvisImagePatchInputs(TensorSchema):
     """
-    Shape: 
-    `(batch_size * (num_patches + 1))`
-    """
-
-    patches_per_image: list[int]
-    """
-    List of number of total patches for each image in the batch.
-    This is used to restore the first two dimensions of `flat_data`.
+    Dimensions:
+        - batch_patches: Batch size * number of patches
+        - patch_size: patch_size_x * patch_size_y * num_channels
+        - patch_indicators: Batch size * (number of patches + 1)
+        - patches_per_image: List of number of total patches for each image
+          in the batch.
     """
+    type: Literal["image_patches"]
+    flat_data: Annotated[torch.Tensor,
+                         TensorShape("batch_patches", "patch_size")]
+    indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")]
+    patches_per_image: Annotated[list[int],
+                                 TensorShape("num_patches_per_image")]
+    # This is used to restore the first two dimensions of `flat_data`.
 
 
 class VisualEmbedding(torch.nn.Embedding):
@@ -458,9 +456,12 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
                 raise ValueError("Incorrect type of indicator_tokens. "
                                  f"Got type: {type(pixel_values)}")
 
+            flat_data = flatten_bn(pixel_values, concat=True)
+            if flat_data.ndim >= 3:
+                flat_data = flat_data.flatten(start_dim=1)
             return OvisImagePatchInputs(
                 type="image_patches",
-                flat_data=flatten_bn(flatten_bn(pixel_values), concat=True),
+                flat_data=flat_data,
                 patches_per_image=[
                     x.shape[0] for x in flatten_bn(pixel_values)
                 ],
@@ -544,7 +545,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
                                                       vision_embeddings)
             input_ids = None
 
-        # up until here we have a inputs_embeds 100% numerical identity
+        # up until here we have an inputs_embeds 100% numerical identity
         # between the OG HF Transformers implementation and ours
         hidden_states = self.llm(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index b74a09ee92c337bee2ad409b2fc4cbd7d6f1a216..d6eec77ebcee52f6e3f93ca7039a6380e1a26ecf 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -12,7 +12,8 @@ from vllm.logger import init_logger
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems)
+                                    MultiModalInputs, MultiModalKwargsItems,
+                                    MultiModalUUIDDict)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -203,13 +204,13 @@ class PaliGemmaMultiModalProcessor(
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         mm_inputs = super().apply(prompt,
                                   mm_data,
                                   hf_processor_mm_kwargs,
                                   tokenization_kwargs,
-                                  mm_hash_overrides=mm_hash_overrides)
+                                  mm_uuids=mm_uuids)
         prompt_token_ids = mm_inputs["prompt_token_ids"]
 
         tokenizer = self.info.get_tokenizer()
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index 6d973a964de0449e66302710747916d5cd50bb55..ab63649b43561d88c299c242ad6d355a695deb10 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -1350,7 +1350,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
             return None
 
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index 0b0d66ae771dd085bc64778177a4d5ed71b9b1cc..a1c452053ddd2918e1fe190c9f89dc9a26714661 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -7,7 +7,7 @@
 #!/usr/bin/env python3
 import abc
 import math
-from typing import Literal, Optional
+from typing import Any, Literal, Optional, Union
 
 import numpy as np
 import torch
@@ -43,7 +43,7 @@ class ConformerEncoderLayer(nn.Module):
             if set different to 0, the number of 
              depthwise_seperable_out_channel will be used as a
              channel_out of the second conv1d layer. 
-             otherwise, it equal to 0, the second conv1d layer is skipped.
+             otherwise, it equals to 0, the second conv1d layer is skipped.
         depthwise_multiplier: int
             number of input_dim channels duplication. this value
              will be used to compute the hidden channels of the Conv1D.
@@ -100,7 +100,7 @@ class ConformerEncoderLayer(nn.Module):
             activation function for glu used in the multihead attention,
              default "swish".
         activation_checkpointing: str, optional
-            a dictionarry of {"module","interval","offload"}, where
+            a dictionary of {"module","interval","offload"}, where
                 "module": str
                     accept ["transformer", "attention"] to select
                     which module should do activation checkpointing.
@@ -115,7 +115,7 @@ class ConformerEncoderLayer(nn.Module):
                     we recalculate activation in backward.
             default "".
         export: bool, optional
-            if set to True, it remove the padding from convolutional layers
+            if set to True, it removes the padding from convolutional layers
              and allow the onnx conversion for inference.
               default False.
         use_pt_scaled_dot_product_attention: bool, optional
@@ -131,31 +131,31 @@ class ConformerEncoderLayer(nn.Module):
 
     def __init__(
         self,
-        d_model=512,
-        ext_pw_out_channel=0,
-        depthwise_seperable_out_channel=256,
-        depthwise_multiplier=1,
-        n_head=4,
-        d_ffn=2048,
-        ext_pw_kernel_size=1,
-        kernel_size=3,
-        dropout_rate=0.1,
-        causal=False,
-        batch_norm=False,
-        activation="relu",
-        chunk_se=0,
-        chunk_size=18,
-        conv_activation="relu",
-        conv_glu_type="sigmoid",
-        bias_in_glu=True,
-        linear_glu_in_convm=False,
-        attention_inner_dim=-1,
-        attention_glu_type="swish",
-        activation_checkpointing="",
-        export=False,
-        use_pt_scaled_dot_product_attention=False,
+        d_model: int = 512,
+        ext_pw_out_channel: int = 0,
+        depthwise_seperable_out_channel: int = 256,
+        depthwise_multiplier: int = 1,
+        n_head: int = 4,
+        d_ffn: int = 2048,
+        ext_pw_kernel_size: int = 1,
+        kernel_size: int = 3,
+        dropout_rate: float = 0.1,
+        causal: bool = False,
+        batch_norm: bool = False,
+        activation: str = "relu",
+        chunk_se: int = 0,
+        chunk_size: int = 18,
+        conv_activation: str = "relu",
+        conv_glu_type: str = "sigmoid",
+        bias_in_glu: bool = True,
+        linear_glu_in_convm: bool = False,
+        attention_inner_dim: int = -1,
+        attention_glu_type: str = "swish",
+        activation_checkpointing: str = "",
+        export: bool = False,
+        use_pt_scaled_dot_product_attention: bool = False,
         attn_group_sizes: int = 1,
-    ):
+    ) -> None:
         super().__init__()
 
         self.feed_forward_in = FeedForward(
@@ -209,24 +209,21 @@ class ConformerEncoderLayer(nn.Module):
 
     def forward(
         self,
-        x,
-        pos_k,
-        pos_v,
-        mask,
+        x: torch.Tensor,
+        pos_k: torch.Tensor,
+        pos_v: torch.Tensor,
+        mask: torch.Tensor,
         relative_attention_bias: Optional[Tensor] = None,
-    ):
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """ConformerEncoder forward.
 
         Args:
-            x: torch.Tensor
-                input feature of shape (batch, max_time_in, size)
-            pos_k: torch.Tensor
-                positional key embedding.
-            mask: torch.Tensor
-                mask for x (batch, max_time_in)
-            relative_attention_bias: Optional[torch.Tensor]
-                bias added to attention logits w.r.t. relative positions 
-                (1, n_head, time1, time2)
+            x: input feature of shape (batch, max_time_in, size)
+            pos_k: positional key embedding.
+            pos_v: positional value embedding.
+            mask: mask for x (batch, max_time_in)
+            relative_attention_bias: bias added to attention logits w.r.t.
+                relative positions (1, n_head, time1, time2)
         """
         x = x + 0.5 * self.feed_forward_in(x)
         norm_x = self.layer_norm_att(x)
@@ -323,25 +320,25 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
     def __init__(
         self,
-        input_size,
-        chunk_size,
-        left_chunk,
-        attention_dim=256,
-        attention_heads=4,
-        input_layer="nemo_conv",
-        cnn_out=-1,
-        cnn_layer_norm=False,
-        time_reduction=4,
-        dropout_rate=0.0,
-        padding_idx=-1,
-        relative_attention_bias_args=None,
-        positional_dropout_rate=0.0,
-        nemo_conv_settings=None,
+        input_size: int,
+        chunk_size: Union[int, list[int]],
+        left_chunk: Union[int, list[int]],
+        attention_dim: int = 256,
+        attention_heads: int = 4,
+        input_layer: str = "nemo_conv",
+        cnn_out: int = -1,
+        cnn_layer_norm: bool = False,
+        time_reduction: int = 4,
+        dropout_rate: float = 0.0,
+        padding_idx: int = -1,
+        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        positional_dropout_rate: float = 0.0,
+        nemo_conv_settings: Optional[dict[str, Any]] = None,
         conv2d_extra_padding: Literal["feat", "feat_time", "none",
                                       True] = "none",
-        attention_group_size=1,
-        encoder_embedding_config=None,
-    ):
+        attention_group_size: int = 1,
+        encoder_embedding_config: Optional[dict[str, Any]] = None,
+    ) -> None:
         super().__init__()
         self.input_size = input_size
         self.input_layer = input_layer
@@ -399,7 +396,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         self.encoder_embedding = MeanVarianceNormLayer(
             self.encoder_embedding_config["input_size"])
 
-    def compute_lens_change(self, feature_lens):
+    def compute_lens_change(
+            self,
+            feature_lens: Union[int,
+                                torch.Tensor]) -> Union[int, torch.Tensor]:
         """feature_lens: int
         return updated feature lens.
 
@@ -433,10 +433,14 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             return ceil_func(feature_lens / self.time_reduction)
 
     @abc.abstractmethod
-    def forward(self):
+    def forward(self) -> Any:
         """Abstract forward method implementation."""
 
-    def _chunk_size_selection(self, chunk_size=None, left_chunk=None):
+    def _chunk_size_selection(
+            self,
+            chunk_size: Optional[Union[int, list[int]]] = None,
+            left_chunk: Optional[Union[int,
+                                       list[int]]] = None) -> tuple[int, int]:
         """If chunk size is a list, we will randomly select a chunk size."""
 
         if chunk_size is None:
@@ -463,7 +467,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
         return chunk_size_train_eff, left_chunk_train_eff
 
-    def _get_embed_class(self, embed):
+    def _get_embed_class(self, embed: nn.Module) -> nn.Module:
         # pylint: disable=protected-access
         is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper)
         is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel)
@@ -474,13 +478,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             embed_class = embed.module
         return embed_class
 
-    def _forward_embeddings_core(self, input_tensor, masks):
+    def _forward_embeddings_core(
+            self, input_tensor: torch.Tensor,
+            masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         embed_class = self._get_embed_class(self.embed)
         assert isinstance(embed_class, NemoConvSubsampling)
         input_tensor, masks = self.embed(input_tensor, masks)
         return input_tensor, masks
 
-    def _position_embedding(self, input_tensor):
+    def _position_embedding(
+        self, input_tensor: torch.Tensor
+    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         pos_k = None
         pos_v = None
         if self.relative_attention_bias_layer is None:
@@ -488,7 +496,9 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
                 input_tensor)  # default to add abs sinusoid embedding
         return pos_k, pos_v
 
-    def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
+    def _streaming_mask(self, seq_len: int, batch_size: int,
+                        chunk_size: Union[int, list[int]],
+                        left_chunk: Union[int, list[int]]) -> torch.Tensor:
         chunk_size_train_eff, left_chunk_train_eff = \
             self._chunk_size_selection(chunk_size, left_chunk)
 
@@ -502,11 +512,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
                 [batch_size, -1, -1]))
         return enc_streaming_mask
 
-    def forward_embeddings(self,
-                           xs_pad,
-                           masks,
-                           chunk_size_nc=None,
-                           left_chunk_nc=None):
+    def forward_embeddings(
+        self,
+        xs_pad: torch.Tensor,
+        masks: torch.Tensor,
+        chunk_size_nc: Optional[Union[int, list[int]]] = None,
+        left_chunk_nc: Optional[Union[int, list[int]]] = None
+    ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor],
+                     Optional[torch.Tensor], torch.Tensor, torch.Tensor],
+               tuple[torch.Tensor, Optional[torch.Tensor],
+                     Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+                     torch.Tensor]]:
         """Forwarding the inputs through the top embedding layers
 
         Args:
@@ -569,7 +585,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
             return input_tensor, pos_k, pos_v, hs_mask, masks
         return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc
 
-    def get_offset(self):
+    def get_offset(self) -> int:
         """Returns offset used when retaining inputs for decoding.
 
         This is essentially, how many additional frames have to be added to
@@ -605,8 +621,6 @@ class ConformerEncoder(TransformerEncoderBase):
             Some examples for the 2 cases:
             left_chunk = 6
             left_chunk = [12, 9, 6, 3]
-        left_chunk: int
-            number of chunks used for masking in streaming mode.
         num_lang: int
             This parameter is used to store the number of languages in the 
             lang_dict, only used for multiseed/multilingual models. 
@@ -686,7 +700,7 @@ class ConformerEncoder(TransformerEncoderBase):
             only work for glu_in_attention !=0
             default "swish".
         export: bool, optional
-            if set to True, it remove the padding from convolutional layers
+            if set to True, it removes the padding from convolutional layers
              and allow the onnx conversion for inference.
               default False.
         activation_checkpointing: str, optional
@@ -751,46 +765,46 @@ class ConformerEncoder(TransformerEncoderBase):
 
     def __init__(  # pylint: disable-all
         self,
-        input_size,
-        chunk_size,
-        left_chunk,
-        num_lang=None,
-        attention_dim=256,
-        attention_heads=4,
-        linear_units=2048,
-        num_blocks=6,
-        dropout_rate=0.1,
-        input_layer="nemo_conv",
-        causal=True,
-        batch_norm=False,
-        cnn_out=-1,
-        cnn_layer_norm=False,
-        ext_pw_out_channel=0,
-        ext_pw_kernel_size=1,
-        depthwise_seperable_out_channel=256,
-        depthwise_multiplier=1,
-        chunk_se=0,
-        kernel_size=3,
-        activation="relu",
-        conv_activation="relu",
-        conv_glu_type="sigmoid",
-        bias_in_glu=True,
-        linear_glu_in_convm=False,
-        attention_glu_type="swish",
-        export=False,
-        extra_layer_output_idx=-1,
-        extra_multi_layer_output_idxs=[],  # noqa
-        activation_checkpointing="",
-        relative_attention_bias_args=None,
-        time_reduction=4,
-        use_pt_scaled_dot_product_attention=False,
-        nemo_conv_settings=None,
+        input_size: int,
+        chunk_size: Union[int, list[int]],
+        left_chunk: Union[int, list[int]],
+        num_lang: Optional[int] = None,
+        attention_dim: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        input_layer: str = "nemo_conv",
+        causal: bool = True,
+        batch_norm: bool = False,
+        cnn_out: int = -1,
+        cnn_layer_norm: bool = False,
+        ext_pw_out_channel: int = 0,
+        ext_pw_kernel_size: int = 1,
+        depthwise_seperable_out_channel: int = 256,
+        depthwise_multiplier: int = 1,
+        chunk_se: int = 0,
+        kernel_size: int = 3,
+        activation: str = "relu",
+        conv_activation: str = "relu",
+        conv_glu_type: str = "sigmoid",
+        bias_in_glu: bool = True,
+        linear_glu_in_convm: bool = False,
+        attention_glu_type: str = "swish",
+        export: bool = False,
+        extra_layer_output_idx: int = -1,
+        extra_multi_layer_output_idxs: list[int] = [],  # noqa
+        activation_checkpointing: str = "",
+        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        time_reduction: int = 4,
+        use_pt_scaled_dot_product_attention: bool = False,
+        nemo_conv_settings: Optional[dict[str, Any]] = None,
         conv2d_extra_padding: Literal["feat", "feat_time", "none",
                                       True] = "none",
-        replication_pad_for_subsample_embedding=False,
-        attention_group_size=1,
-        encoder_embedding_config=None,
-    ):
+        replication_pad_for_subsample_embedding: bool = False,
+        attention_group_size: int = 1,
+        encoder_embedding_config: Optional[dict[str, Any]] = None,
+    ) -> None:
         super().__init__(
             input_size,
             chunk_size,
@@ -852,11 +866,13 @@ class ConformerEncoder(TransformerEncoderBase):
         # the device and the needed dtype:
         self.register_buffer("dev_type", torch.zeros(()), persistent=False)
 
-    def init_relative_attention_bias(self, input_tensor):
+    def init_relative_attention_bias(
+            self, input_tensor: torch.Tensor) -> Optional[torch.Tensor]:
         if self.relative_attention_bias_layer:
             return self.relative_attention_bias_layer(input_tensor)
 
-    def calculate_hs_mask(self, xs_pad, device, mask):
+    def calculate_hs_mask(self, xs_pad: torch.Tensor, device: torch.device,
+                          mask: Optional[torch.Tensor]) -> torch.Tensor:
         max_audio_length = xs_pad.shape[1]
         batch_size = xs_pad.shape[0]
         enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size,
@@ -877,7 +893,8 @@ class ConformerEncoder(TransformerEncoderBase):
         return pad_mask
 
     @torch.jit.ignore
-    def forward(self, xs_pad, masks):
+    def forward(self, xs_pad: torch.Tensor,
+                masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """Conformer Forward function
 
         Args:
@@ -997,7 +1014,12 @@ class WindowQformer(nn.Module):
                            if normalize_before else None)
         self.window_size = window_size
 
-    def forward(self, audio_embed, mask, embed_len=None):
+    def forward(
+            self,
+            audio_embed: torch.Tensor,
+            mask: Optional[torch.Tensor],
+            embed_len: Optional[int] = None
+    ) -> tuple[torch.Tensor, Optional[int]]:
         """forward decoder"""
         # audio_embed: N x T x D => N x D x T
 
@@ -1042,7 +1064,7 @@ class WindowQformer(nn.Module):
 class AudioEmbedding(nn.Module):
     """Image embedding."""
 
-    def __init__(self, config: PretrainedConfig, **kwargs) -> None:
+    def __init__(self, config: PretrainedConfig, **kwargs: Any) -> None:
         super().__init__()
         self.config = config
         # n_embed or hidden_size for text LM
@@ -1148,19 +1170,18 @@ class AudioEmbedding(nn.Module):
         self.input_embeds = None
         self.audio_embed_sizes = None
 
-    def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None:
+    def set_audio_embeds(self, input_embeds: torch.Tensor) -> None:
         self.input_embeds = input_embeds
 
-    def set_audio_embed_sizes(self,
-                              audio_embed_sizes: torch.LongTensor) -> None:
+    def set_audio_embed_sizes(self, audio_embed_sizes: torch.Tensor) -> None:
         self.audio_embed_sizes = audio_embed_sizes
 
     def get_audio_features(
         self,
-        input_embeds: torch.FloatTensor,
-        audio_attention_mask: torch.Tensor = None,
+        input_embeds: torch.Tensor,
+        audio_attention_mask: Optional[torch.Tensor] = None,
         audio_projection_mode: str = "speech",
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """
         arguments:
             input_embeds: audio features (B, T, D)  B: num audios in a sequence
@@ -1214,10 +1235,10 @@ class AudioEmbedding(nn.Module):
 
     def forward(
         self,
-        audio_features: torch.FloatTensor,
-        audio_attention_mask: torch.Tensor = None,
+        audio_features: torch.Tensor,
+        audio_attention_mask: Optional[torch.Tensor] = None,
         audio_projection_mode: str = "speech",
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """
         arguments:
             audio_features: audio features (T, D)
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index c4890d8427e2a2df07c72c324ec9291b2d58e520..6fbfca619a42f74db015a360ca80dff26424c9ce 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -16,13 +16,13 @@ from torch import Tensor, nn
 class BlockBase(nn.Module):
     """Block abstract module"""
 
-    def __init__(self, input_size, output_size):
+    def __init__(self, input_size: int, output_size: int) -> None:
         super().__init__()
         self.input_size = input_size
         self.output_size = output_size
 
 
-def get_activation(name="relu"):
+def get_activation(name: str = "relu") -> torch.nn.Module:
     """Select an activation function by name
 
     Args:
@@ -43,15 +43,18 @@ def get_activation(name="relu"):
     return nn.Identity()
 
 
-def adaptive_enc_mask(x_len, chunk_start_idx, left_window=0, right_window=0):
+def adaptive_enc_mask(x_len: int,
+                      chunk_start_idx: list[int],
+                      left_window: int = 0,
+                      right_window: int = 0) -> torch.Tensor:
     """
     The function is very important for Transformer Transducer Streaming mode
     Args:
-        xs_len (int): sequence length
-        chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48]. 
+        x_len: sequence length
+        chunk_start_idx: first idx of each chunk, such as [0,18,36,48]. 
         It also supports adaptive chunk size [0,10,15,45]
-        left_window (int): how many left chunks can be seen
-        right_window (int): how many right chunks can be seen. It is used for 
+        left_window: how many left chunks can be seen
+        right_window: how many right chunks can be seen. It is used for 
         chunk overlap model.
         Returns:
             mask (torch.Tensor): a mask tensor for streaming model
@@ -172,13 +175,13 @@ class GLUPointWiseConv(nn.Module):
 
     def __init__(
         self,
-        input_dim,
-        output_dim,
-        kernel_size,
-        glu_type="sigmoid",
-        bias_in_glu=True,
-        causal=False,
-    ):
+        input_dim: int,
+        output_dim: int,
+        kernel_size: int,
+        glu_type: str = "sigmoid",
+        bias_in_glu: bool = True,
+        causal: bool = False,
+    ) -> None:
         super().__init__()
 
         self.glu_type = glu_type
@@ -216,11 +219,10 @@ class GLUPointWiseConv(nn.Module):
             self.b1 = nn.Parameter(torch.zeros(1, output_dim, 1))
             self.b2 = nn.Parameter(torch.zeros(1, output_dim, 1))
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         """
         Args:
-            x: torch.Tensor
-                input tensor
+            x: input tensor
         """
         # to be consistent with GLULinear, we assume the input always has the
         # #channel (#dim) in the last dimension of the tensor, so need to
@@ -258,7 +260,7 @@ class DepthWiseSeperableConv1d(nn.Module):
             if set different to 0, the number of 
              depthwise_seperable_out_channel will be used as a channel_out
              of the second conv1d layer.
-             otherwise, it equal to 0, the second conv1d layer is skipped.
+             otherwise, it equals to 0, the second conv1d layer is skipped.
         kernel_size: int
             kernel_size
         depthwise_multiplier: int
@@ -272,12 +274,12 @@ class DepthWiseSeperableConv1d(nn.Module):
 
     def __init__(
         self,
-        input_dim,
-        depthwise_seperable_out_channel,
-        kernel_size,
-        depthwise_multiplier,
-        padding=0,
-    ):
+        input_dim: int,
+        depthwise_seperable_out_channel: int,
+        kernel_size: int,
+        depthwise_multiplier: int,
+        padding: int = 0,
+    ) -> None:
         super().__init__()
 
         self.dw_conv = nn.Conv1d(
@@ -301,12 +303,11 @@ class DepthWiseSeperableConv1d(nn.Module):
             self.pw_conv = nn.Identity()
         self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         """
 
         Args:
-            x: torch.Tensor
-                input tensor
+            x: input tensor
         """
         x = self.dw_conv(x)
         if self.depthwise_seperable_out_channel != 0:
@@ -375,23 +376,23 @@ class ConvModule(nn.Module):
 
     def __init__(
         self,
-        input_dim,
-        ext_pw_out_channel,
-        depthwise_seperable_out_channel,
-        ext_pw_kernel_size,
-        kernel_size,
-        depthwise_multiplier,
-        dropout_rate,
-        causal=False,
-        batch_norm=False,
-        chunk_se=0,
-        chunk_size=18,
-        activation="relu",
-        glu_type="sigmoid",
-        bias_in_glu=True,
-        linear_glu_in_convm=False,
-        export=False,
-    ):
+        input_dim: int,
+        ext_pw_out_channel: int,
+        depthwise_seperable_out_channel: int,
+        ext_pw_kernel_size: int,
+        kernel_size: int,
+        depthwise_multiplier: int,
+        dropout_rate: float,
+        causal: bool = False,
+        batch_norm: bool = False,
+        chunk_se: int = 0,
+        chunk_size: int = 18,
+        activation: str = "relu",
+        glu_type: str = "sigmoid",
+        bias_in_glu: bool = True,
+        linear_glu_in_convm: bool = False,
+        export: bool = False,
+    ) -> None:
         super().__init__()
         self.layer_norm = nn.LayerNorm(input_dim)
         self.input_dim = input_dim
@@ -437,7 +438,7 @@ class ConvModule(nn.Module):
                 self.ln2 = nn.Linear(input_dim * depthwise_multiplier,
                                      input_dim)
 
-    def _add_ext_pw_layer(self):
+    def _add_ext_pw_layer(self) -> None:
         """
         This function is an extension of __init__ function
         and dedicated to the convolution module creation
@@ -497,12 +498,11 @@ class ConvModule(nn.Module):
             self.pw_conv_simplify_w = torch.nn.Parameter(torch.ones(3))
             self.pw_conv_simplify_b = torch.nn.Parameter(torch.zeros(3))
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         """ConvModule Forward.
 
         Args:
-            x: torch.Tensor
-                input tensor.
+            x: input tensor.
         """
         x = self.layer_norm(x)
 
@@ -567,21 +567,20 @@ class GLULinear(nn.Module):
 
     def __init__(
         self,
-        input_dim,
-        output_dim,
-        glu_type="sigmoid",
-        bias_in_glu=True,
-    ):
+        input_dim: int,
+        output_dim: int,
+        glu_type: str = "sigmoid",
+        bias_in_glu: bool = True,
+    ) -> None:
         super().__init__()
         self.linear = nn.Linear(input_dim, output_dim * 2, bias_in_glu)
         self.glu_act = GLU(-1, glu_type)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         """GLULinear forward
 
         Args:
-            x: torch.Tensor
-                inpute tensor.
+            x: input tensor.
         """
         x = self.linear(x)
         return self.glu_act(x)
@@ -609,12 +608,12 @@ class FeedForward(nn.Module):
 
     def __init__(
         self,
-        d_model,
-        d_inner,
-        dropout_rate,
-        activation="sigmoid",
-        bias_in_glu=True,
-    ):
+        d_model: int,
+        d_inner: int,
+        dropout_rate: float,
+        activation: str = "sigmoid",
+        bias_in_glu: bool = True,
+    ) -> None:
         super().__init__()
         self.d_model = d_model
         self.d_inner = d_inner
@@ -628,12 +627,11 @@ class FeedForward(nn.Module):
             nn.Dropout(dropout_rate),
         )
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         """FeedForward forward function.
 
         Args:
-            x: torch.Tensor
-                input tensor.
+            x: input tensor.
         """
         out = self.net(self.layer_norm(x))
 
@@ -642,14 +640,14 @@ class FeedForward(nn.Module):
 
 #### positional encoding starts here
 def _pre_hook(
-    state_dict,
-    prefix,
-    local_metadata,
-    strict,
-    missing_keys,
-    unexpected_keys,
-    error_msgs,
-):
+    state_dict: dict,
+    prefix: str,
+    local_metadata: dict,
+    strict: bool,
+    missing_keys: list[str],
+    unexpected_keys: list[str],
+    error_msgs: list[str],
+) -> None:
     """Perform pre-hook in load_state_dict for backward compatibility.
 
     Note:
@@ -708,10 +706,10 @@ class T5RelativeAttentionLogitBias(nn.Module):
     """
 
     def __init__(self,
-                 num_heads,
-                 num_buckets=-1,
-                 max_distance=1000,
-                 symmetric=False):
+                 num_heads: int,
+                 num_buckets: int = -1,
+                 max_distance: int = 1000,
+                 symmetric: bool = False) -> None:
         super().__init__()
         self.num_heads = num_heads
         self.num_buckets = num_buckets
@@ -727,7 +725,7 @@ class T5RelativeAttentionLogitBias(nn.Module):
             self.num_buckets *= 2
         self.bias_values = nn.Embedding(self.num_buckets, self.num_heads)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
         # instantiate bias compatible with shape of x
         maxpos = x.size(1)
         context_position = torch.arange(maxpos,
@@ -760,7 +758,7 @@ class T5RelativeAttentionLogitBias(nn.Module):
 
         return t5_rel_att_bias
 
-    def _bucket_relative_position(self, relative_position):
+    def _bucket_relative_position(self, relative_position: Tensor) -> Tensor:
         # this is a placeholder (isn't tested, likely buggy) using HuggingFace
         # implem as a reference this also needs to be extended to support
         # asymmetric +/- ve positions
@@ -810,7 +808,10 @@ class AbsolutePositionalEncoding(nn.Module):
 
     """
 
-    def __init__(self, d_model, dropout_rate, max_len=5000):
+    def __init__(self,
+                 d_model: int,
+                 dropout_rate: float,
+                 max_len: int = 5000) -> None:
         """Construct an PositionalEncoding object."""
         super().__init__()
         self.d_model = d_model
@@ -820,11 +821,11 @@ class AbsolutePositionalEncoding(nn.Module):
         self.extend_pe(torch.tensor(0.0).expand(1, max_len))
         self._register_load_state_dict_pre_hook(_pre_hook)
 
-    def extend_pe(self, x):
+    def extend_pe(self, x: torch.Tensor) -> None:
         """Reset the positional encodings.
 
         Args:
-            x: torch.Tensor
+            x: input tensor
         """
         if self.pe is not None and self.pe.size(1) >= x.size(1):
             if self.pe.dtype != x.dtype or self.pe.device != x.device:
@@ -840,15 +841,14 @@ class AbsolutePositionalEncoding(nn.Module):
         pe = pe.unsqueeze(0)
         self.pe = pe.to(device=x.device, dtype=x.dtype)
 
-    def forward(self, x: torch.Tensor):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Add positional encoding.
 
         Args:
-            x: torch.Tensor
-                Input tensor. shape is (batch, time, ...)
+            x: Input tensor. shape is (batch, time, ...)
 
         Returns:
-            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+            Encoded tensor. Its shape is (batch, time, ...)
 
         """
         self.extend_pe(x)
@@ -868,7 +868,7 @@ class MeanVarianceNormLayer(nn.Module):
             layer input size.
     """
 
-    def __init__(self, input_size):
+    def __init__(self, input_size: int) -> None:
         super().__init__()
         self.input_size = input_size
         self.global_mean = nn.Parameter(torch.zeros(input_size))
@@ -878,8 +878,7 @@ class MeanVarianceNormLayer(nn.Module):
         """MeanVarianceNormLayer Forward
 
         Args:
-            input_: torch.Tensor
-                input tensor.
+            input_: input tensor.
         """
         return (input_ - self.global_mean) * self.global_invstd
 
@@ -949,7 +948,10 @@ class CausalConv1D(nn.Conv1d):
             dtype=dtype,
         )
 
-    def update_cache(self, x, cache=None):
+    def update_cache(
+            self,
+            x: Tensor,
+            cache: Optional[Tensor] = None) -> tuple[Tensor, Optional[Tensor]]:
         if cache is None:
             new_x = F.pad(x, pad=(self._left_padding, self._right_padding))
             next_cache = cache
@@ -963,7 +965,11 @@ class CausalConv1D(nn.Conv1d):
             next_cache = next_cache[:, :, -cache.size(-1):]
         return new_x, next_cache
 
-    def forward(self, x, cache=None):
+    def forward(
+        self,
+        x: Tensor,
+        cache: Optional[Tensor] = None
+    ) -> Union[Tensor, tuple[Tensor, Optional[Tensor]]]:
         x, cache = self.update_cache(x, cache=cache)
         x = super().forward(x)
         if cache is None:
@@ -1017,8 +1023,8 @@ class CausalConv2D(nn.Conv2d):
 
     def forward(
         self,
-        x,
-    ):
+        x: Tensor,
+    ) -> Tensor:
         x = F.pad(
             x,
             pad=(self._left_padding, self._right_padding, 0, 0),
@@ -1062,16 +1068,16 @@ class NemoConvSubsampling(torch.nn.Module):
     """
 
     def __init__(
-            self,
-            feat_in,
-            feat_out,
-            subsampling_factor=4,
-            subsampling="dw_striding",
-            conv_channels=256,
-            subsampling_conv_chunking_factor=1,
-            activation=nn.ReLU(),  # noqa: B008
-            is_causal=False,
-    ):
+        self,
+        feat_in: int,
+        feat_out: int,
+        subsampling_factor: int = 4,
+        subsampling: str = "dw_striding",
+        conv_channels: int = 256,
+        subsampling_conv_chunking_factor: int = 1,
+        activation: torch.nn.Module = nn.ReLU(),  # noqa: B008
+        is_causal: bool = False,
+    ) -> None:
         super().__init__()
         self._subsampling = subsampling
         self._conv_channels = conv_channels
@@ -1328,28 +1334,25 @@ class NemoConvSubsampling(torch.nn.Module):
 
         self.conv = torch.nn.Sequential(*layers)
 
-    def get_sampling_frames(self):
+    def get_sampling_frames(self) -> list[int]:
         return [1, self.subsampling_factor]
 
-    def get_streaming_cache_size(self):
+    def get_streaming_cache_size(self) -> list[int]:
         return [0, self.subsampling_factor + 1]
 
-    def forward(self, x, mask):
+    def forward(self, x: Tensor,
+                mask: Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]:
         """
         Forward method for NeMo subsampling.
 
         Args:
-            x[Batch, Time, Filters]: torch.Tensor
-                input tensor
-            x_mask: torch.Tensor
-                input mask
+            x: input tensor
+            mask: input mask
 
         Returns:
-            x: torch.Tensor
-                Resulting tensor from subsampling (B, T // 
+            x: Resulting tensor from subsampling (B, T // 
                 time_reduction_factor, feat_out)
-            pad_mask: torch.Tensor
-                tensor of padded hidden state sequences (B, 1, T // 
+            pad_mask: tensor of padded hidden state sequences (B, 1, T // 
                 time_reduction_factor)
         """
         x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2)
@@ -1403,7 +1406,7 @@ class NemoConvSubsampling(torch.nn.Module):
             padding_length.size(0), -1) < padding_length.unsqueeze(1)
         return x, pad_mask.unsqueeze(1)
 
-    def reset_parameters(self):
+    def reset_parameters(self) -> None:
         # initialize weights
         if self._subsampling == "dw_striding":
             with torch.no_grad():
@@ -1433,7 +1436,7 @@ class NemoConvSubsampling(torch.nn.Module):
                 torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale)
                 torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale)
 
-    def conv_split_by_batch(self, x):
+    def conv_split_by_batch(self, x: Tensor) -> tuple[Tensor, bool]:
         """Tries to split input by batch, run conv and concat results"""
         b, _, _, _ = x.size()
         if b == 1:  # can't split if batch size is 1
@@ -1460,7 +1463,7 @@ class NemoConvSubsampling(torch.nn.Module):
             True,
         )
 
-    def conv_split_by_channel(self, x):
+    def conv_split_by_channel(self, x: Tensor) -> Tensor:
         """For dw convs, tries to split input by time, run conv and concat 
         results"""
         x = self.conv[0](x)  # full conv2D
@@ -1500,7 +1503,8 @@ class NemoConvSubsampling(torch.nn.Module):
             x = self.conv[i * 3 + 4](x)  # activation
         return x
 
-    def channel_chunked_conv(self, conv, chunk_size, x):
+    def channel_chunked_conv(self, conv: torch.nn.Module, chunk_size: int,
+                             x: Tensor) -> Tensor:
         """Performs channel chunked convolution"""
 
         ind = 0
@@ -1541,7 +1545,7 @@ class NemoConvSubsampling(torch.nn.Module):
         return torch.cat(out_chunks, 1)
 
     def change_subsampling_conv_chunking_factor(
-            self, subsampling_conv_chunking_factor: int):
+            self, subsampling_conv_chunking_factor: int) -> None:
         if (subsampling_conv_chunking_factor != -1
                 and subsampling_conv_chunking_factor != 1
                 and subsampling_conv_chunking_factor % 2 != 0):
@@ -1552,12 +1556,12 @@ class NemoConvSubsampling(torch.nn.Module):
         self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
 
 
-def calc_length(lengths,
-                all_paddings,
-                kernel_size,
-                stride,
-                ceil_mode,
-                repeat_num=1):
+def calc_length(lengths: Tensor,
+                all_paddings: int,
+                kernel_size: int,
+                stride: int,
+                ceil_mode: bool,
+                repeat_num: int = 1) -> Tensor:
     """Calculates the output length of a Tensor passed through a convolution or
       max pooling layer"""
     add_pad: float = all_paddings - kernel_size
@@ -1573,11 +1577,11 @@ def calc_length(lengths,
 class AttModule(nn.Module):
     """Attention abstraction module"""
 
-    def __init__(self):
+    def __init__(self) -> None:
         super().__init__()
         self.export_mode = False
 
-    def set_export(self, mode=True):
+    def set_export(self, mode: bool = True) -> None:
         """set the export mode"""
         self.export_mode = mode
 
@@ -1591,14 +1595,10 @@ class AttModule(nn.Module):
         """AttModule forward
 
         Args:
-            x: torch.Tensor
-                input tensor.
-            memory: torch.Tensor, optional
-                memory tensor.
-            pos_emb: torch.Tensor, optional
-                positional encoder embedding.
-            att_mask: torch.Tensor, optional
-                attention mask tensor.
+            x: input tensor.
+            memory: memory tensor.
+            pos_emb: positional encoder embedding.
+            att_mask: attention mask tensor.
         """
         return x, memory, pos_emb, att_mask
 
@@ -1606,15 +1606,15 @@ class AttModule(nn.Module):
 class AttBlock(BlockBase, AttModule):
     """Attention Block module to support both Attention and Block module."""
 
-    def memory_dims(self, max_len=False):
+    def memory_dims(self, max_len: bool = False) -> tuple[int, int]:
         """memory dimensions"""
         return (1, self.input_size)
 
 
 def masked_softmax(
-    scores,
+    scores: Tensor,
     mask: Optional[Tensor],
-):
+) -> Tensor:
     if mask is not None:
         mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
         scores = scores.masked_fill(mask, -torch.inf)
@@ -1636,10 +1636,6 @@ class MultiHeadedAttention(nn.Module):
             input size features.
         dropout_rate: float
             dropout rate.
-        use_LN: bool
-            apply layer norm or not
-        dropout_at_output: bool
-            whether to apply dropout at output
         attention_inner_dim: int, optional
             the attention dimension used in the class,
             it can be different from the input dimension n_feat.
@@ -1666,16 +1662,16 @@ class MultiHeadedAttention(nn.Module):
 
     def __init__(
         self,
-        n_head,
-        n_feat,
-        dropout_rate,
-        attention_inner_dim=-1,
-        glu_type="swish",
-        bias_in_glu=True,
-        use_pt_scaled_dot_product_attention=False,
-        n_value=-1,
+        n_head: int,
+        n_feat: int,
+        dropout_rate: float,
+        attention_inner_dim: int = -1,
+        glu_type: str = "swish",
+        bias_in_glu: bool = True,
+        use_pt_scaled_dot_product_attention: bool = False,
+        n_value: int = -1,
         group_size: int = 1,
-    ):
+    ) -> None:
         super().__init__()
         if n_value == -1:
             n_value = n_feat
@@ -1718,28 +1714,22 @@ class MultiHeadedAttention(nn.Module):
         query: Tensor,
         key: Tensor,
         value: Tensor,
-        pos_k: Tensor,
-        pos_v: Tensor,
+        pos_k: Optional[Tensor],
+        pos_v: Optional[Tensor],
         mask: Optional[Tensor],
         relative_attention_bias: Optional[Tensor] = None,
-    ):
+    ) -> Tensor:
         """Compute 'Scaled Dot Product Attention'.
 
         Args:
-            query: torch.Tensor
-                query tensor (batch, time1, size)
-            key: torch.Tensor
-                key tensor (batch, time2, size)
-            value: torch.Tensor
-                value tensor (batch, time1, size)
-            pos_k: torch.Tensor
-                key tensor used for relative positional embedding.
-            pos_v: torch.Tensor
-                value tensor used for relative positional embedding.
-            mask: torch.Tensor
-                mask tensor (batch, time1, time2)
-            relative_attention_bias: torch.Tensor
-                bias added to attention logits w.r.t. relative positions
+            query: query tensor (batch, time1, size)
+            key: key tensor (batch, time2, size)
+            value: value tensor (batch, time1, size)
+            pos_k: key tensor used for relative positional embedding.
+            pos_v: value tensor used for relative positional embedding.
+            mask: mask tensor (batch, time1, time2)
+            relative_attention_bias: bias added to attention logits w.r.t. 
+                relative positions
                 (1, n_head, time1, time2)
         """
         n_batch = query.size(0)
@@ -1832,20 +1822,20 @@ class MultiSequential(torch.nn.Sequential):
     """Multi-input multi-output torch.nn.Sequential"""
 
     @torch.jit.ignore
-    def forward(self, *args):
+    def forward(self, *args) -> tuple:
         """Forward method implementation."""
         for m in self:
             args = m(*args)
         return args
 
 
-def get_offset(input_layer: str, time_reduction: int):
+def get_offset(input_layer: str, time_reduction: int) -> int:
     """Get an offset. We will use the offset for determining #frames of a 
     subsampled feature.
 
     Args:
-        input_layer (str): Type of an input layer
-        time_reduction (int): time reduction factor for downsampling a feature
+        input_layer: Type of an input layer
+        time_reduction: time reduction factor for downsampling a feature
     Returns:
         int: offset
     """
@@ -1858,13 +1848,14 @@ def get_offset(input_layer: str, time_reduction: int):
     return 0
 
 
-def unfold_tensor(xs_pad, max_seq_len):
+def unfold_tensor(xs_pad: Tensor, max_seq_len: int) -> Tensor:
     """
     For a given tensor with shape of (N, T, D), if sequence length T is 
     longer than max_seq_len, this function unfold it to a 
     (NT', max_seq_len, D) where T' is T // max_seq_len.
     Args:
-        xs_pad: N, T, D
+        xs_pad: input tensor with shape (N, T, D)
+        max_seq_len: maximum sequence length
     """
     _, _, D = xs_pad.shape
     xs_pad = xs_pad.transpose(-1, -2)  # convert to N, D, T
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index e7f5799a800674418f4df61dca5e8fb4762dea06..142d3251bc67a7e074336523a01a398a0a8c23f8 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    NestedTensors)
+                                    MultiModalUUIDDict, NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                    MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -316,14 +316,14 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         # NOTE: The tokens are already inserted by the chat template
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 7f70e44b10a6d8eaceb610f6d4facb7ed09f8c78..b9869f5e58800d339aac4b8b11d4af18032a5076 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -3,19 +3,24 @@
 """Inference-only PLaMo2 model."""
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    from vllm.attention.backends.abstract import AttentionBackend
 
 import torch
 from torch import nn
-from transformers import PretrainedConfig, PreTrainedModel
+from transformers import PretrainedConfig
 
+from vllm import envs
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
-from vllm.forward_context import get_forward_context
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -23,8 +28,11 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.abstract import MambaBase
 from vllm.model_executor.layers.mamba.mamba2_metadata import (
-    Mamba2Metadata, prepare_mamba2_metadata)
+    Mamba2Metadata, prepare_mamba2_metadata, update_metadata)
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator, MambaStateShapeCalculator)
 from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
     causal_conv1d_fn, causal_conv1d_update)
 from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
@@ -39,7 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import (
     composed_weight_loader, default_weight_loader, sharded_weight_loader)
 from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   SupportsPP, SupportsV0Only)
+                                                   SupportsPP)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.models.utils import (
@@ -47,8 +55,10 @@ from vllm.model_executor.models.utils import (
     make_layers, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import LayerBlockType
+from vllm.utils import LayerBlockType, direct_register_custom_op
+from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 
 # Only used for type hinting.
@@ -73,20 +83,6 @@ class Plamo2Config(PretrainedConfig):  # type: ignore
     vocab_size: int
 
 
-class Plamo2PreTrainedModel(PreTrainedModel):  # type: ignore
-
-    def _init_weights(self, module: torch.nn.Module) -> None:
-        std = 0.02
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
 def is_mamba(config: Plamo2Config, i: int) -> bool:
     assert config.mamba_step > 1
 
@@ -99,7 +95,8 @@ def is_mamba(config: Plamo2Config, i: int) -> bool:
 # Adapted from:
 # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2
 # transformers.models.mamba.modeling_mamba.MambaMixer
-class Plamo2MambaMixer(nn.Module):
+@CustomOp.register(name="plamo2_mamba_mixer")
+class Plamo2MambaMixer(MambaBase, CustomOp):
 
     def __init__(self,
                  vllm_config: VllmConfig,
@@ -108,6 +105,8 @@ class Plamo2MambaMixer(nn.Module):
                  **kwargs) -> None:
         super().__init__()
         self.config = vllm_config.model_config.hf_config
+        self.cache_config = vllm_config.cache_config
+        self.model_config = vllm_config.model_config
         self.quant_config = vllm_config.quant_config
         self.hidden_size = self.config.hidden_size
         self.ssm_state_size = self.config.mamba_d_state
@@ -115,8 +114,6 @@ class Plamo2MambaMixer(nn.Module):
         self.intermediate_size = (self.config.mamba_num_heads *
                                   self.config.hidden_size_per_head)
         self.tp_size = get_tensor_model_parallel_world_size()
-        self.intermediate_size_per_tp_worker = \
-            self.intermediate_size // self.tp_size
         self.head_dim = self.config.hidden_size_per_head
         self.num_heads = self.config.mamba_num_heads
         self.time_step_rank = max(64, self.hidden_size // 16)
@@ -197,6 +194,22 @@ class Plamo2MambaMixer(nn.Module):
         self.C_norm = RMSNorm(self.ssm_state_size,
                               eps=self.config.rms_norm_eps)
 
+        self.chunk_size = self.config.mamba_chunk_size
+
+        if envs.VLLM_USE_V1:
+            compilation_config = get_current_vllm_config().compilation_config
+            if prefix in compilation_config.static_forward_context:
+                raise ValueError(f"Duplicate layer name: {prefix}")
+            compilation_config.static_forward_context[prefix] = self
+            # The outer list is for v0 PP virtual engine. Though this code path
+            # only runs for v1, we have to do this to unify with the interface
+            # of Attention + v0 PP.
+            # The inner tuple is (conv_state, ssm_state)
+            self.kv_cache = [(torch.tensor([]), torch.tensor([]))]
+            assert self.chunk_size != -1, "chunk_size must be set for v1"
+
+        self.prefix = prefix
+
     def _project_ssm_parameters(self, hidden_states):
         ssm_parameters = self.bcdt_proj(hidden_states)
         B, C, time_step = torch.split(
@@ -212,25 +225,76 @@ class Plamo2MambaMixer(nn.Module):
         dt = self.dt_proj(time_step)
         return B, C, dt
 
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
+        pass
+
     def forward(
         self,
         hidden_states: torch.Tensor,
+        output: torch.Tensor,
         mamba_cache_params: MambaCacheParams,
         mamba2_metadata: Mamba2Metadata,
         **kwargs,
-    ) -> torch.Tensor:
+    ):
+        if not envs.VLLM_USE_V1:
+            CustomOp.forward(self, hidden_states, output, mamba_cache_params,
+                             mamba2_metadata)
+        else:
+            torch.ops.vllm.plamo2_mamba_mixer(
+                hidden_states,
+                output,
+                self.prefix,
+            )
+
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
 
+        forward_context = get_forward_context()
         # mamba2_metadata contains metadata necessary for the mamba2 triton
         # kernels to operate in continuous batching and in chunked prefill
         # modes; they are computed at top-level model forward since they
         # stay the same and reused for all mamba layers in the same iteration
-        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
-
-        num_prefills = attn_metadata.num_prefills  # request count
-        num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
-        num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
-        has_prefill = num_prefills > 0
-        has_decode = num_decodes > 0
+        attn_metadata: AttentionMetadata = forward_context.attn_metadata
+        if envs.VLLM_USE_V1:
+            if attn_metadata is not None:
+                assert isinstance(attn_metadata, dict)
+                attn_metadata = attn_metadata[self.prefix]
+                mamba2_metadata = attn_metadata
+                assert isinstance(attn_metadata, Mamba2AttentionMetadata)
+                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+                # conv_state = (..., dim, width-1) yet contiguous along 'dim'
+                conv_state = self_kv_cache[0].transpose(-1, -2)
+                ssm_state = self_kv_cache[1]
+                state_indices_tensor = attn_metadata.state_indices_tensor
+                has_initial_states_p = attn_metadata.has_initial_states_p
+                prep_initial_states = attn_metadata.prep_initial_states
+                chunk_size = attn_metadata.chunk_size
+                seq_idx_p = attn_metadata.seq_idx_p
+                chunk_indices_p = attn_metadata.chunk_indices_p
+                chunk_offsets_p = attn_metadata.chunk_offsets_p
+        else:
+            conv_state = mamba_cache_params.conv_state
+            ssm_state = mamba_cache_params.ssm_state
+            state_indices_tensor = mamba_cache_params.state_indices_tensor
+            has_initial_states_p = mamba2_metadata.has_initial_states
+            prep_initial_states = mamba2_metadata.prep_initial_states
+            chunk_size = mamba2_metadata.chunk_size
+            seq_idx_p = mamba2_metadata.seq_idx
+            chunk_indices_p = mamba2_metadata.chunk_indices
+            chunk_offsets_p = mamba2_metadata.chunk_offsets
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)
@@ -240,23 +304,59 @@ class Plamo2MambaMixer(nn.Module):
         conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
                                                self.conv1d.weight.size(2))
 
+        if envs.VLLM_USE_V1 and attn_metadata is None:
+            # V1 profile run
+            hidden_states = (hidden_states.transpose(0, 1).clone().transpose(
+                0, 1)).contiguous()
+            output[:] = self.out_proj(hidden_states)
+            return
+
+        num_prefills = attn_metadata.num_prefills  # request count
+        num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
+        num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
+        has_prefill = num_prefills > 0
+        has_decode = num_decodes > 0
+        num_actual_tokens = num_prefill_tokens + num_decodes
+
+        # NOTE: V0 put prefill before decode, v1 puts decode before prefill
         # Separate prefill and decode by splitting varlen input
         # Split along token dimension
-        hidden_states_p, hidden_states_d = torch.split(
-            hidden_states,
-            [num_prefill_tokens, num_decodes],
-            dim=0,
-        )
-        gate_p, gate_d = torch.split(gate, [num_prefill_tokens, num_decodes],
-                                     dim=0)
-        # Split along batch dimension
-        state_indices_tensor_p, state_indices_tensor_d = torch.split(
-            mamba_cache_params.state_indices_tensor,
-            [num_prefills, num_decodes],
-            dim=0,
-        )
-        query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + 1]
-                             if has_prefill else None)
+        if envs.VLLM_USE_V1:
+            hidden_states_d, hidden_states_p = torch.split(
+                hidden_states[:num_actual_tokens],
+                [num_decodes, num_prefill_tokens],
+                dim=0,
+            )
+            gate_d, gate_p = torch.split(gate[:num_actual_tokens],
+                                         [num_decodes, num_prefill_tokens],
+                                         dim=0)
+            # Split along batch dimension
+            state_indices_tensor_d, state_indices_tensor_p = torch.split(
+                state_indices_tensor,
+                [num_decodes, num_prefills],
+                dim=0,
+            )
+            query_start_loc_p = (
+                attn_metadata.query_start_loc[-num_prefills - 1:] -
+                num_decodes if has_prefill else None)
+        else:
+            hidden_states_p, hidden_states_d = torch.split(
+                hidden_states,
+                [num_prefill_tokens, num_decodes],
+                dim=0,
+            )
+            gate_p, gate_d = torch.split(gate,
+                                         [num_prefill_tokens, num_decodes],
+                                         dim=0)
+            # Split along batch dimension
+            state_indices_tensor_p, state_indices_tensor_d = torch.split(
+                state_indices_tensor,
+                [num_prefills, num_decodes],
+                dim=0,
+            )
+            query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills +
+                                                               1]
+                                 if has_prefill else None)
 
         # Preallocate output tensor to avoid memcpy cost for merging prefill
         # and decode outputs
@@ -268,25 +368,38 @@ class Plamo2MambaMixer(nn.Module):
             dtype=hidden_states.dtype,
             device=hidden_states.device,
         )
-        preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split(
-            preallocated_ssm_out,
-            [num_prefill_tokens, num_decodes],
-            dim=0,
-        )
+        if envs.VLLM_USE_V1:
+            preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split(
+                preallocated_ssm_out,
+                [num_decodes, num_prefill_tokens],
+                dim=0,
+            )
+        else:
+            preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split(
+                preallocated_ssm_out,
+                [num_prefill_tokens, num_decodes],
+                dim=0,
+            )
 
         # Process prefill requests
         if has_prefill:
             # 2. Convolution sequence transformation
             # - "cache_indices" updates the conv_state cache in positions
-            # pointed to by "mamba_cache_params.state_indices_tensor"
+            #   pointed to by "state_indices_tensor"
+            x = hidden_states_p.transpose(
+                0, 1)  # this is the form that causal-conv see
+            if mamba2_metadata.cu_seqlen is None:
+                mamba2_metadata = update_metadata(x, query_start_loc_p,
+                                                  mamba2_metadata)
             hidden_states_p = causal_conv1d_fn(
-                hidden_states_p.transpose(0, 1),
+                x,
                 conv_weights,
                 self.conv1d.bias,
                 activation=self.activation,
-                conv_states=mamba_cache_params.conv_state,
-                has_initial_state=mamba2_metadata.has_initial_states,
+                conv_states=conv_state,
+                has_initial_state=has_initial_states_p,
                 cache_indices=state_indices_tensor_p,
+                metadata=mamba2_metadata,
                 query_start_loc=query_start_loc_p)
             hidden_states_p = hidden_states_p.transpose(0, 1)
             hidden_states_p = hidden_states_p[:num_prefill_tokens]
@@ -299,12 +412,16 @@ class Plamo2MambaMixer(nn.Module):
 
             # 3. State Space Model sequence transformation
             initial_states = None
-            if (mamba2_metadata.has_initial_states is not None
-                    and mamba2_metadata.prep_initial_states):
+            if has_initial_states_p is not None and prep_initial_states:
                 # making a copy of the states
-                initial_states = torch.where(
-                    mamba2_metadata.has_initial_states[:, None, None, None],
-                    mamba_cache_params.ssm_state[state_indices_tensor_p], 0)
+                if envs.VLLM_USE_V1:
+                    initial_states = torch.where(
+                        has_initial_states_p[:, None, None, None],
+                        ssm_state[state_indices_tensor_p], 0)
+                else:
+                    initial_states = torch.where(
+                        has_initial_states_p[:num_prefills, None, None, None],
+                        ssm_state[state_indices_tensor_p], 0)
             varlen_state = mamba_chunk_scan_combined(
                 hidden_states_p.view(1, num_prefill_tokens,
                                      self.num_heads // self.tp_size,
@@ -313,15 +430,15 @@ class Plamo2MambaMixer(nn.Module):
                 self.A,
                 B.view(1, num_prefill_tokens, 1, -1),
                 C.view(1, num_prefill_tokens, 1, -1),
-                chunk_size=mamba2_metadata.chunk_size,
+                chunk_size=chunk_size,
                 D=self.D,
                 z=gate_p.view(1, num_prefill_tokens,
                               self.num_heads // self.tp_size, self.head_dim),
                 dt_bias=self.dt_bias,
-                seq_idx=mamba2_metadata.seq_idx,
-                chunk_indices=mamba2_metadata.chunk_indices,
-                chunk_offsets=mamba2_metadata.chunk_offsets,
-                cu_seqlens=attn_metadata.query_start_loc[:num_prefills + 1],
+                seq_idx=seq_idx_p,
+                chunk_indices=chunk_indices_p,
+                chunk_offsets=chunk_offsets_p,
+                cu_seqlens=query_start_loc_p,
                 initial_states=initial_states,
                 return_varlen_states=True,
                 return_final_states=False,
@@ -329,18 +446,19 @@ class Plamo2MambaMixer(nn.Module):
                 dt_limit=(0.0, float("inf")),
                 out=preallocated_ssm_out_p.view(1, num_prefill_tokens, -1,
                                                 self.head_dim),
+                state_dtype=ssm_state.dtype,
             )
 
             # update ssm states
             # - varlen state is a (batch, nheads, headdim, dstate) tensor
-            mamba_cache_params.ssm_state[state_indices_tensor_p] = varlen_state
+            ssm_state[state_indices_tensor_p] = varlen_state
 
         # Process decode requests
         if has_decode:
             # 2. Convolution sequence transformation
             hidden_states_d = causal_conv1d_update(
                 hidden_states_d,
-                mamba_cache_params.conv_state,
+                conv_state,
                 conv_weights,
                 self.conv1d.bias,
                 self.activation,
@@ -363,8 +481,10 @@ class Plamo2MambaMixer(nn.Module):
             # - the hidden is reshaped into (bs, num_heads, head_dim)
             # - mamba_cache_params.ssm_state's slots will be selected
             #   using state_indices_tensor_d
+
+            # NOTE: final output is an in-place update of out tensor
             selective_state_update(
-                mamba_cache_params.ssm_state,
+                ssm_state,
                 hidden_states_d,
                 dt,
                 A,
@@ -378,11 +498,68 @@ class Plamo2MambaMixer(nn.Module):
                 out=preallocated_ssm_out_d.view(num_decodes, -1,
                                                 self.head_dim),
             )
-            assert self.num_heads % self.tp_size == 0
 
         # 4. Final linear projection
-        out = self.out_proj(preallocated_ssm_out)
-        return out
+        output[:num_actual_tokens] = self.out_proj(preallocated_ssm_out)
+
+    def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
+        assert self.model_config is not None
+        assert self.cache_config is not None
+        return MambaStateDtypeCalculator.mamba2_state_dtype(
+            self.model_config.dtype,
+            self.cache_config.mamba_cache_dtype,
+            self.cache_config.mamba_ssm_cache_dtype,
+        )
+
+    def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
+        return MambaStateShapeCalculator.mamba2_state_shape(
+            intermediate_size=self.intermediate_size,
+            tp_world_size=get_tensor_model_parallel_world_size(),
+            n_groups=0,
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            state_size=self.ssm_state_size,
+            conv_kernel=self.conv_kernel_size,
+        )
+
+    @property
+    def mamba_type(self) -> str:
+        return "mamba2"
+
+    def get_attn_backend(self) -> type["AttentionBackend"]:
+        from vllm.v1.attention.backends.mamba2_attn import (
+            Mamba2AttentionBackend)
+        return Mamba2AttentionBackend
+
+
+def plamo2_mamba_mixer(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self.forward_cuda(hidden_states=hidden_states,
+                      output=output,
+                      mamba_cache_params=None,
+                      mamba2_metadata=None)
+
+
+def plamo2_mamba_mixer_fake(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="plamo2_mamba_mixer",
+    op_func=plamo2_mamba_mixer,
+    mutates_args=["output"],
+    fake_impl=plamo2_mamba_mixer_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
 
 
 class DenseMLP(nn.Module):
@@ -418,7 +595,6 @@ class DenseMLP(nn.Module):
         return self.down_proj(h)
 
 
-@support_torch_compile
 class Plamo2AttentionMixer(nn.Module):
 
     def __init__(self,
@@ -575,12 +751,24 @@ class Plamo2DecoderLayer(nn.Module):
             hidden_states, residual = self.pre_mixer_norm(
                 hidden_states, residual)
 
+        if self.is_mamba:
+            # Plamo2MambaMixer writes output to this tensor
+            output = torch.empty_like(hidden_states)
+            mixer_kwargs = {
+                "output": output,
+                "mamba_cache_params": mamba_cache_params,
+                "mamba2_metadata": mamba2_metadata,
+            }
+        else:
+            mixer_kwargs = {
+                "positions": positions,
+            }
         hidden_states = self.mixer(
-            positions=positions,
             hidden_states=hidden_states,
-            mamba_cache_params=mamba_cache_params,
-            mamba2_metadata=mamba2_metadata,
+            **mixer_kwargs,
         )
+        if self.is_mamba:
+            hidden_states = output
         hidden_states = self.post_mixer_norm(hidden_states)
         # Fully Connected
         hidden_states, residual = self.pre_mlp_norm(hidden_states, residual)
@@ -591,7 +779,7 @@ class Plamo2DecoderLayer(nn.Module):
 
 class Plamo2Decoder(torch.nn.Module):
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)}
@@ -617,7 +805,7 @@ class Plamo2Decoder(torch.nn.Module):
         mamba_cache_index = 0
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             layer_mamba_cache_params = None
-            if layer.is_mamba:
+            if layer.is_mamba and mamba_cache_params is not None:
                 layer_mamba_cache_params = mamba_cache_params.at_layer_idx(
                     mamba_cache_index)
                 mamba_cache_index += 1
@@ -632,10 +820,11 @@ class Plamo2Decoder(torch.nn.Module):
         return hidden_states, residual
 
 
-class Plamo2Model(Plamo2PreTrainedModel):
+@support_torch_compile
+class Plamo2Model(torch.nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__(vllm_config.model_config.hf_config)
+        super().__init__()
 
         config = vllm_config.model_config.hf_config
 
@@ -653,9 +842,9 @@ class Plamo2Model(Plamo2PreTrainedModel):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
-        self.layers = Plamo2Decoder(vllm_config, prefix=f"{prefix}.layers")
+        self.layers = Plamo2Decoder(vllm_config=vllm_config,
+                                    prefix=f"{prefix}.layers")
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_init()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
@@ -679,11 +868,16 @@ class Plamo2Model(Plamo2PreTrainedModel):
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
-        mamba2_metadata = prepare_mamba2_metadata(
-            chunk_size=self.config.mamba_chunk_size,
-            attn_metadata=attn_metadata,
-        )
+        if not envs.VLLM_USE_V1:
+            attn_metadata: AttentionMetadata = get_forward_context(
+            ).attn_metadata
+            mamba2_metadata = prepare_mamba2_metadata(
+                chunk_size=self.config.mamba_chunk_size,
+                attn_metadata=attn_metadata,
+            )
+        else:
+            # v1 get mamba2_metadata from forward_context
+            mamba2_metadata = None
 
         hidden_states, residual = self.layers(
             positions=positions,
@@ -701,8 +895,7 @@ class Plamo2Model(Plamo2PreTrainedModel):
         return hidden_states
 
 
-class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
-                        IsHybrid, SupportsV0Only):
+class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -712,12 +905,10 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__()
         config = vllm_config.model_config.hf_config
         scheduler_config = vllm_config.scheduler_config
-        assert not vllm_config.cache_config.enable_prefix_caching, \
-            "PLaMo2 currently does not support prefix caching"
 
-        super().__init__(config)
         self.config = config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -751,8 +942,6 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
-        # Initialize weights and apply final processing
-        self.post_init()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
@@ -763,19 +952,27 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
                 intermediate_tensors: Optional[IntermediateTensors] = None,
                 inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs):
-        if self.mamba_cache is None:
-            num_mamba_layers = self.model_config.get_num_layers_by_block_type(
-                self.vllm_config.parallel_config, LayerBlockType.mamba)
-
-            self.mamba_cache = MambaCacheManager(
-                self.vllm_config,
-                num_mamba_layers,
-                *self._get_mamba_cache_shape(),
-                self.lm_head.weight.dtype,
-                self.lm_head.weight.dtype,
-            )
-
-        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+        if not envs.VLLM_USE_V1:
+            if self.mamba_cache is None:
+                num_mamba_layers = (
+                    self.model_config.get_num_layers_by_block_type(
+                        self.vllm_config.parallel_config,
+                        LayerBlockType.mamba))
+
+                mamba_state_shape = self.get_mamba_state_shape_from_config(
+                    self.vllm_config, use_v1=False)
+                mamba_state_dtype = \
+                    self.get_mamba_state_dtype_from_config(
+                    self.vllm_config)
+                self.mamba_cache = MambaCacheManager(self.vllm_config,
+                                                     num_mamba_layers,
+                                                     *mamba_state_shape,
+                                                     *mamba_state_dtype)
+
+            mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+        else:
+            # NOTE: mamba_cache_params is not needed for v1
+            mamba_cache_params = None
 
         hidden_states = self.model(input_ids, positions, mamba_cache_params,
                                    intermediate_tensors, inputs_embeds)
@@ -788,21 +985,48 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
     def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
         return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
 
-    def _get_mamba_cache_shape(
-            self) -> tuple[tuple[int, int], tuple[int, int, int]]:
-        world_size = get_tensor_model_parallel_world_size()
-        hidden_size = (self.config.mamba_num_heads *
-                       self.config.hidden_size_per_head)
-        conv_state_shape = (
-            hidden_size // world_size,
-            self.config.mamba_d_conv - 1,
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[torch.dtype, torch.dtype]:
+
+        return MambaStateDtypeCalculator.mamba2_state_dtype(
+            vllm_config.model_config.dtype,
+            vllm_config.cache_config.mamba_cache_dtype,
+            vllm_config.cache_config.mamba_ssm_cache_dtype,
         )
-        temporal_state_shape = (
-            divide(self.config.mamba_num_heads, world_size),
-            self.config.hidden_size_per_head,
-            self.config.mamba_d_state,
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+        use_v1: bool = True,
+    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
+        """Calculate shapes for Mamba's convolutional and state caches.
+        Args:
+            vllm_config: vLLM config
+            use_v1: Get shapes for V1 (or V0)
+        Returns:
+            Tuple containing:
+            - conv_state_shape: Shape for convolutional state cache
+            - temporal_state_shape: Shape for state space model cache
+        """
+        parallel_config = vllm_config.parallel_config
+        hf_config = vllm_config.model_config.hf_config
+        intermediate_size =\
+                hf_config.mamba_num_heads * hf_config.hidden_size_per_head
+
+        return MambaStateShapeCalculator.mamba2_state_shape(
+            intermediate_size=intermediate_size,
+            tp_world_size=parallel_config.tensor_parallel_size,
+            n_groups=0,
+            num_heads=hf_config.mamba_num_heads,
+            head_dim=hf_config.hidden_size_per_head,
+            state_size=hf_config.mamba_d_state,
+            conv_kernel=hf_config.mamba_d_conv,
+            use_v1=use_v1,
         )
-        return conv_state_shape, temporal_state_shape
 
     def compute_logits(
         self,
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
deleted file mode 100644
index 2edc357d2df1bec8975d95c56c0dd6216a3cf593..0000000000000000000000000000000000000000
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Copyright 2025 The vLLM team.
-# Copyright 2025 IBM.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Inference-only IBM/NASA Prithvi Geospatial model."""
-
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Optional, Union
-
-import torch
-import torch.nn as nn
-from transformers import BatchFeature
-
-from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.utils import AutoWeightsLoader
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, MultiModalKwargsItems,
-                                    PlaceholderRange)
-from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
-                                   MultiModalDataItems, MultiModalDataParser)
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.sequence import IntermediateTensors
-
-from .interfaces import (IsAttentionFree, MultiModalEmbeddings,
-                         SupportsMultiModal)
-from .interfaces_base import default_pooling_type
-
-
-def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]):
-    # This model receives in input a multi-dimensional tensor representing
-    # a single image patch and therefore it is not to be split
-    # into multiple elements, but rather to be considered a single one.
-    # Hence, the decision of using a MultiModalSharedField.
-    # The expected shape is (num_channels, width, height).
-
-    # This model however allows the user to also submit multiple image
-    # patches as a batch, adding a further dimension to the above shape.
-    # At this stage we only support submitting one patch per request and
-    # batching is achieved via vLLM batching.
-    # TODO (christian-pinto): enable support for multi patch requests
-    # in tandem with vLLM batching.
-    return dict(
-        pixel_values=MultiModalFieldConfig.shared(batch_size=1,
-                                                  modality="image"),
-        location_coords=MultiModalFieldConfig.shared(batch_size=1,
-                                                     modality="image"),
-    )
-
-
-class PrithviGeoSpatialMAEMultiModalDataParser(MultiModalDataParser):
-
-    def _parse_image_data(
-        self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
-        if isinstance(data, dict):
-            return DictEmbeddingItems(
-                data,
-                modality="image",
-                required_fields={"pixel_values", "location_coords"},
-                fields_factory=_prithvi_field_config,
-            )
-
-        return super()._parse_image_data(data)
-
-
-class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
-
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
-
-
-class PrithviGeoSpatialMAEInputBuilder(
-        BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
-
-    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-        return ""
-
-    def get_dummy_mm_data(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> MultiModalDataDict:
-        # This model input is fixed and is in the form of a torch Tensor.
-        # The size of pixel_values might change in the cases where we resize
-        # the input but never exceeds the dimensions below.
-        image_data = {
-            "pixel_values": torch.full((6, 512, 512), 1.0,
-                                       dtype=torch.float16),
-            "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
-        }
-
-        return {"image": image_data}
-
-
-class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
-
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return PrithviGeoSpatialMAEMultiModalDataParser()
-
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        return _prithvi_field_config(hf_inputs)
-
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargsItems,
-    ) -> Sequence[PromptUpdate]:
-        return []
-
-    def apply(
-        self,
-        prompt: Union[str, list[int]],
-        mm_data: MultiModalDataDict,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
-    ) -> MultiModalInputs:
-        if "image" in mm_data:
-            image_data = mm_data["image"]
-        else:
-            image_data = mm_data
-            mm_data = {"image": mm_data}
-
-        mm_items = self._to_mm_items(mm_data)
-        tokenization_kwargs = tokenization_kwargs or {}
-        mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else
-                     self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
-                                         tokenization_kwargs))
-        mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
-
-        mm_processed_data = BatchFeature(image_data)
-
-        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
-            mm_processed_data,
-            self._get_mm_fields_config(mm_processed_data,
-                                       hf_processor_mm_kwargs),
-        )
-
-        return MultiModalInputs(
-            type="multimodal",
-            prompt=prompt,
-            prompt_token_ids=[1],
-            mm_kwargs=mm_kwargs,
-            mm_hashes=mm_hashes,
-            mm_placeholders=mm_placeholders,
-        )
-
-
-@default_pooling_type("All")
-@MULTIMODAL_REGISTRY.register_processor(
-    PrithviGeoSpatialMAEMultiModalProcessor,
-    info=PrithviGeoSpatialMAEProcessingInfo,
-    dummy_inputs=PrithviGeoSpatialMAEInputBuilder,
-)
-class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal):
-    """Prithvi Masked Autoencoder"""
-
-    supports_multimodal_raw_input_only = True
-    is_pooling_model = True
-
-    @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
-        if modality.startswith("image"):
-            return None
-
-        raise ValueError("Only image modality is supported")
-
-    def _instantiate_model(self, config: dict) -> Optional[nn.Module]:
-        # We might be able/need to support different tasks with this same model
-        if config["task_args"]["task"] == "SemanticSegmentationTask":
-            from terratorch.cli_tools import SemanticSegmentationTask
-
-            task = SemanticSegmentationTask(
-                config["model_args"],
-                config["task_args"]["model_factory"],
-                loss=config["task_args"]["loss"],
-                lr=config["task_args"]["lr"],
-                ignore_index=config["task_args"]["ignore_index"],
-                optimizer=config["task_args"]["optimizer"],
-                optimizer_hparams=config["optimizer_params"],
-                scheduler=config["task_args"]["scheduler"],
-                scheduler_hparams=config["scheduler_params"],
-                plot_on_val=config["task_args"]["plot_on_val"],
-                freeze_decoder=config["task_args"]["freeze_decoder"],
-                freeze_backbone=config["task_args"]["freeze_backbone"],
-            )
-
-            return task.model
-        else:
-            return None
-
-    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-
-        # the actual model is dynamically instantiated using terratorch
-        # allowing us to perform changes to the model architecture
-        # at startup time (e.g., change the model decoder class.)
-        self.model = self._instantiate_model(
-            vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"])
-        if self.model is None:
-            raise ValueError(
-                "Unsupported task. "
-                "Only SemanticSegmentationTask is supported for now "
-                "by PrithviGeospatialMAE.")
-
-        pooler_config = vllm_config.model_config.pooler_config
-        assert pooler_config is not None
-
-        self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)}, )
-
-    def _parse_and_validate_multimodal_data(
-            self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        pixel_values = kwargs.pop("pixel_values", None)
-        if not isinstance(pixel_values, torch.Tensor):
-            raise ValueError(f"Incorrect type of pixel_values. "
-                             f"Got type: {type(pixel_values)}")
-
-        location_coords = kwargs.pop("location_coords", None)
-        if not isinstance(location_coords, torch.Tensor):
-            raise ValueError(f"Incorrect type of location_coords. "
-                             f"Got type: {type(location_coords)}")
-        location_coords = torch.unbind(location_coords, dim=0)[0]
-        if location_coords.shape == torch.Size([0]):
-            location_coords = None
-
-        return pixel_values, location_coords
-
-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        # We do not really use any input tokens and therefore no embeddings
-        # to be calculated. However, due to the mandatory token ids in
-        # the input prompt we pass one token and the size of the dummy
-        # embedding tensors must reflect that.
-        return torch.empty((input_ids.shape[0], 0))
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor],
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        **kwargs: object,
-    ):
-        pixel_values, location_coords = (
-            self._parse_and_validate_multimodal_data(**kwargs))
-        model_output = self.model(pixel_values,
-                                  location_coords=location_coords)
-
-        return model_output.output
-
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
-        params_list = []
-        model_buffers = dict(self.named_buffers())
-        loaded_buffers = []
-        for key, value in weights:
-            if key == "state_dict":
-                weights_to_parse = value
-                for name, weight in weights_to_parse.items():
-                    if "pos_embed" in name:
-                        continue
-
-                    if "_timm_module." in name:
-                        name = name.replace("_timm_module.", "")
-
-                    # this model requires a couple of buffers to be loaded
-                    # that are not loadable with the AutoWeightsLoader
-                    if name in model_buffers:
-                        if "_timm_module." in name:
-                            name = name.replace("_timm_module.", "")
-                        buffer = model_buffers[name]
-                        weight_loader = getattr(buffer, "weight_loader",
-                                                default_weight_loader)
-                        weight_loader(buffer, weight)
-                        loaded_buffers.append(name)
-                    else:
-                        params_list.append((name, weight))
-                break
-
-        # Load the remaining model parameters
-        loader = AutoWeightsLoader(self)
-        autoloaded_weights = loader.load_weights(params_list)
-
-        return autoloaded_weights.union(set(loaded_buffers))
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 5c64c81547e651650c4e835e857bf3671c92153d..f8a943d4cab3355031f08418c216fb8814cd010a 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -25,7 +25,7 @@
 from collections.abc import Iterable, Mapping, Sequence
 from copy import copy
 from functools import partial
-from typing import Any, Callable, Optional, Union
+from typing import Annotated, Any, Callable, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -41,14 +41,14 @@ from transformers.models.whisper import WhisperFeatureExtractor
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs,
     Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs,
     Qwen2_5_VLProcessingInfo, Qwen2_5_VLVideoEmbeddingInputs,
     Qwen2_5_VLVideoInputs, Qwen2_5_VLVideoPixelInputs)
 from vllm.model_executor.models.qwen2_audio import (
-    Qwen2AudioFeatureInputs, Qwen2AudioProcessingInfo,
-    _get_feat_extract_output_lengths)
+    Qwen2AudioProcessingInfo, _get_feat_extract_output_lengths)
 from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -65,8 +65,10 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, WeightsMapper,
                     init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
@@ -79,6 +81,26 @@ except (ImportError, ModuleNotFoundError):
 logger = init_logger(__name__)
 
 
+class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
+    """
+    Dimensions:
+        - na: Number of audios
+        - nmb: Number of mel bins
+        - msl: Maximum sequence length
+        - tsl: Total sequence length
+    """
+    type: Literal["audio_features"]
+    input_features: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("nmb", "tsl"),
+    ]
+
+    feature_attention_mask: Annotated[
+        torch.Tensor,
+        TensorShape("na", "msl"),
+    ]
+
+
 def create_qwen2_5_omni_thinker_field_factory(
     spatial_merge_size: int
 ) -> Callable[[Mapping[str, torch.Tensor]], Mapping[str,
@@ -534,7 +556,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
             return torch.concat(mm_input, dim=dim)
 
     def _parse_and_validate_audio_input(
-            self, **kwargs: object) -> Optional[Qwen2AudioFeatureInputs]:
+            self, **kwargs: object) -> Optional[Qwen2_5OmniAudioFeatureInputs]:
         input_audio_features = kwargs.pop('input_audio_features', None)
         audio_feature_lengths = kwargs.pop('audio_feature_lengths', None)
         feature_attention_mask = kwargs.pop('feature_attention_mask', None)
@@ -548,7 +570,8 @@ class Qwen2_5OmniConditionalGenerationMixin:
         if not isinstance(input_audio_features, (torch.Tensor, list)):
             raise ValueError("Incorrect type of audio input features. "
                              f"Got type: {type(input_audio_features)}")
-        return Qwen2AudioFeatureInputs(
+        return Qwen2_5OmniAudioFeatureInputs(
+            type="audio_features",
             input_features=input_audio_features,
             audio_feature_lengths=audio_feature_lengths,
             feature_attention_mask=feature_attention_mask)
@@ -631,7 +654,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
 
     def _process_audio_input(
         self,
-        audio_input: Qwen2AudioFeatureInputs,
+        audio_input: Qwen2_5OmniAudioFeatureInputs,
         audio_hashes: list[str] = None,
         cached_audio_features: torch.Tensor = None,
     ) -> torch.Tensor:
@@ -658,8 +681,8 @@ class Qwen2_5OmniConditionalGenerationMixin:
             feature_lens=audio_feature_lengths,
             aftercnn_lens=audio_feat_lengths,
         )
-        audio_features = audio_outputs.last_hidden_state
-        return audio_features.split(audio_output_lengths.tolist())
+        return audio_outputs.last_hidden_state.split(
+            audio_output_lengths.tolist())
 
     def _process_image_input(
             self,
@@ -705,7 +728,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
     dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder,
 )
 class Qwen2_5OmniThinkerForConditionalGeneration(
-        nn.Module, SupportsMultiModal, SupportsPP,
+        nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
         Qwen2_5OmniConditionalGenerationMixin):
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -713,6 +736,22 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             "thinker.model.": "language_model.model.",
             "thinker.": "",
         })
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "attn.qkv": [
+            "attn.q",
+            "attn.k",
+            "attn.v",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
@@ -807,7 +846,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             return []
 
         # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
         multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         # NOTE: It is important to iterate over the keys in this dictionary
@@ -834,7 +873,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         if multimodal_embeddings is not None \
             and len(multimodal_embeddings) != 0:
 
-            # TODO (ywang96): support overlapping modalitiy embeddings so that
+            # TODO (ywang96): support overlapping modality embeddings so that
             # `use_audio_in_video` will work on V1.
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings, [
@@ -935,3 +974,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
                                              mapper=self.hf_to_vllm_mapper)
 
         return loaded_weights
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="merger.",
+            tower_model=["visual.", "audio_tower."])
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index b528083b7c9cc8e9757f50e8642e36fc04b39cfd..8aa7775570297f8ac255b02b811e80e835d030c6 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -27,7 +27,7 @@
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial
-from typing import Callable, Literal, Optional, TypedDict, Union
+from typing import Annotated, Callable, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -48,9 +48,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 # yapf: disable
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
-                                               MergedReplicatedLinear,
                                                QKVParallelLinear,
-                                               ReplicatedLinear,
                                                RowParallelLinear)
 # yapf: enable
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -65,6 +63,7 @@ from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP, SupportsQuant)
@@ -81,84 +80,125 @@ logger = init_logger(__name__)
 # === Vision Inputs === #
 
 
-class Qwen2_5_VLImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """Shape:
-    `(num_patches, num_channels * patch_size * patch_size)`
+class Qwen2_5_VLImagePixelInputs(TensorSchema):
     """
-
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - np: Number of patches
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
+    
+    Historical context:
+        - pixel_values shape: (num_patches, num_channels * patch_size * 
+          patch_size)
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          formatnum_channels * patch_size * patch_size
     """
+    type: Literal["pixel_values"]
 
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "cps"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
 
-class Qwen2_5_VLImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    image_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all images' features.
-        Each tensor holds an image's features.
-    - `torch.Tensor`: A tensor holding all images' features
-        (concatenation of all images' feature tensors).
-
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-        the number and resolution of the images.
-    - `hidden_size` must match the hidden size of language model backbone.
-    """
 
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size
+        - ni: Number of images
+    
+    Historical context:
+        - image_embeds shape: (num_image_features, hidden_size)
+        - num_image_features varies based on the number and resolution of the
+          images.
+        - hidden_size must match the hidden size of language model backbone.
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["image_embeds"]
+
+    image_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
 
 
 Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs,
                               Qwen2_5_VLImageEmbeddingInputs]
 
 
-class Qwen2_5_VLVideoPixelInputs(TypedDict):
-    type: Literal["pixel_values_videos"]
-    pixel_values_videos: torch.Tensor
-    """Shape:
-    `(num_patches,
-      num_channels * temporal_patch_size * patch_size * patch_size)`
+class Qwen2_5_VLVideoPixelInputs(TensorSchema):
     """
+    Dimensions:
+        - np: Number of patches
+        - nv: Number of videos
+        - ctps: Number of channels * temporal_patch_size * patch_size * 
+          patch_size
+    
+    Historical context:
+        - pixel_values_videos shape: (num_patches, num_channels * 
+          temporal_patch_size * patch_size * patch_size)
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
+        - second_per_grid_ts: The video time interval (in seconds) for each
+          grid along the temporal dimension in the 3D position IDs. Returned
+          when `videos` is not `None`.
+    """
+    type: Literal["pixel_values_videos"]
 
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
+    pixel_values_videos: Annotated[
+        torch.Tensor,
+        TensorShape("np", "ctps"),
+    ]
 
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
-    second_per_grid_ts: torch.Tensor
-    """
-    The video time interval (in seconds) for each grid along the temporal
-    dimension in the 3D position IDs. Returned when `videos` is not `None`.
-    """
+    second_per_grid_ts: Annotated[
+        Optional[torch.Tensor],
+        TensorShape("nv"),
+    ]
 
 
-class Qwen2_5_VLVideoEmbeddingInputs(TypedDict):
-    type: Literal["video_embeds"]
-    video_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all videos' features.
-        Each tensor holds an video's features.
-    - `torch.Tensor`: A tensor holding all videos' features
-      (concatenation of all videos' feature tensors).
-
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-        the number and resolution of the videos.
-    - `hidden_size` must match the hidden size of language model backbone.
+class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
     """
-
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - nf: Number of video features
+        - hs: Hidden size
+        - nv: Number of videos
+    
+    Historical context:
+        - video_embeds shape: (num_video_features, hidden_size)
+        - num_video_features varies based on the number and resolution of the
+          videos.
+        - hidden_size must match the hidden size of language model backbone.
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["video_embeds"]
+
+    video_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
 
 Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs,
@@ -178,22 +218,20 @@ class Qwen2_5_VisionMLP(nn.Module):
                  prefix: str = "",
                  use_data_parallel: bool = False):
         super().__init__()
-        cls_gate_up_proj = (MergedReplicatedLinear if use_data_parallel else
-                            MergedColumnParallelLinear)
-        self.gate_up_proj = cls_gate_up_proj(
+        self.gate_up_proj = MergedColumnParallelLinear(
             input_size=in_features,
             output_sizes=[hidden_features] * 2,  # [gate_proj, up_proj]
             bias=bias,
             quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-
-        cls_down_proj = (ReplicatedLinear
-                         if use_data_parallel else RowParallelLinear)
-        self.down_proj = cls_down_proj(hidden_features,
-                                       in_features,
-                                       bias=bias,
-                                       quant_config=quant_config,
-                                       prefix=f"{prefix}.down_proj")
+            prefix=f"{prefix}.gate_up_proj",
+            disable_tp=use_data_parallel)
+
+        self.down_proj = RowParallelLinear(hidden_features,
+                                           in_features,
+                                           bias=bias,
+                                           quant_config=quant_config,
+                                           prefix=f"{prefix}.down_proj",
+                                           disable_tp=use_data_parallel)
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -243,30 +281,21 @@ class Qwen2_5_VisionAttention(nn.Module):
         self.num_attention_heads_per_partition = dist_utils.divide(
             num_heads, self.tp_size)
 
-        if use_data_parallel:
-            self.qkv = ReplicatedLinear(embed_dim,
-                                        self.hidden_size_per_attention_head *
-                                        3 * num_heads,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.qkv")
-
-        else:
-            self.qkv = QKVParallelLinear(
-                hidden_size=embed_dim,
-                head_size=self.hidden_size_per_attention_head,
-                total_num_heads=num_heads,
-                total_num_kv_heads=num_heads,
-                bias=True,
-                quant_config=quant_config,
-                prefix=f"{prefix}.qkv")
+        self.qkv = QKVParallelLinear(
+            hidden_size=embed_dim,
+            head_size=self.hidden_size_per_attention_head,
+            total_num_heads=num_heads,
+            total_num_kv_heads=num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv",
+            disable_tp=use_data_parallel)
 
-        cls_proj = (ReplicatedLinear
-                    if use_data_parallel else RowParallelLinear)
-        self.proj = cls_proj(input_size=projection_size,
-                             output_size=embed_dim,
-                             quant_config=quant_config,
-                             prefix=f"{prefix}.proj")
+        self.proj = RowParallelLinear(input_size=projection_size,
+                                      output_size=embed_dim,
+                                      quant_config=quant_config,
+                                      prefix=f"{prefix}.proj",
+                                      disable_tp=use_data_parallel)
 
         # Detect attention implementation.
         self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
@@ -480,32 +509,32 @@ class Qwen2_5_VisionPatchMerger(nn.Module):
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.ln_q = norm_layer(context_dim)
 
-        cls_fc1 = (ReplicatedLinear
-                   if use_data_parallel else ColumnParallelLinear)
-        cls_fc2 = (ReplicatedLinear
-                   if use_data_parallel else RowParallelLinear)
-        self.mlp = nn.ModuleList([
-            cls_fc1(self.hidden_size,
-                    self.hidden_size,
-                    bias=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.mlp.0"),
+        self.mlp = nn.Sequential(
+            ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp.0",
+                return_bias=False,
+                disable_tp=use_data_parallel,
+            ),
             nn.GELU(),
-            cls_fc2(self.hidden_size,
-                    d_model,
-                    bias=True,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.mlp.2"),
-        ])
+            RowParallelLinear(
+                self.hidden_size,
+                d_model,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp.2",
+                return_bias=False,
+                disable_tp=use_data_parallel,
+            ),
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.ln_q(x)
         x = x.view(-1, self.hidden_size)
-
-        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
-        x_parallel, _ = mlp_fc1(x)
-        x_parallel = mlp_act(x_parallel)
-        out, _ = mlp_fc2(x_parallel)
+        out = self.mlp(x)
         return out
 
 
@@ -687,6 +716,15 @@ class Qwen2_5_VisionTransformer(nn.Module):
             seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
+    @staticmethod
+    def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
+        # building the inverse permutation in O(n) time
+        inv = torch.empty_like(perm)
+        inv[perm] = torch.arange(perm.numel(),
+                                 device=perm.device,
+                                 dtype=perm.dtype)
+        return inv
+
     def forward(
         self,
         x: torch.Tensor,
@@ -730,6 +768,8 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         rotary_pos_emb = torch.cat(rotary_pos_emb)
         window_index = torch.cat(window_index)
+        # compute reverse indices
+        reverse_indices = self.invert_permutation(window_index)
         cu_window_seqlens = torch.cat(cu_window_seqlens)
         cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
         cu_seqlens = torch.cat(cu_seqlens)
@@ -783,7 +823,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         # adapter
         hidden_states = self.merger(hidden_states)
-        reverse_indices = torch.argsort(window_index)
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 
@@ -948,10 +987,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             return Qwen2_5_VLImagePixelInputs(type="pixel_values",
                                               pixel_values=pixel_values,
                                               image_grid_thw=image_grid_thw)
@@ -962,9 +997,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return Qwen2_5_VLImageEmbeddingInputs(
                 type="image_embeds",
                 image_embeds=image_embeds,
@@ -985,7 +1017,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                 pixel_values_videos, "video pixel values")
             video_grid_thw = self._validate_and_reshape_mm_tensor(
                 video_grid_thw, "video grid_thw")
-
+            if second_per_grid_ts is not None and second_per_grid_ts.ndim == 2:
+                second_per_grid_ts = second_per_grid_ts.squeeze(-1)
             return Qwen2_5_VLVideoPixelInputs(
                 type="pixel_values_videos",
                 pixel_values_videos=pixel_values_videos,
@@ -999,9 +1032,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             video_grid_thw = self._validate_and_reshape_mm_tensor(
                 video_grid_thw, "video grid_thw")
 
-            if not isinstance(video_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
             return Qwen2_5_VLVideoEmbeddingInputs(
                 type="video_embeds",
                 video_embeds=video_embeds,
@@ -1021,8 +1051,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             pixel_values = image_input["pixel_values"]
 
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values, grid_thw_list)
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")
             else:
                 image_embeds = self.visual(pixel_values,
                                            grid_thw=grid_thw_list)
@@ -1048,8 +1080,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
             if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values_videos, grid_thw_list)
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values_videos,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")
             else:
                 video_embeds = self.visual(pixel_values_videos,
                                            grid_thw=grid_thw_list)
@@ -1158,21 +1192,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             positions: Flattened (concatenated) position ids corresponding to a
-                batch.
-                **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL
-                opensource models), the shape will be `(3, seq_len)`,
+                batch. **NOTE**: If mrope is enabled (default setting for
+                Qwen2.5-VL opensource models), the shape will be `(3, seq_len)`,
                 otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
-            second_per_grid_ts: Tensor `(num_videos)` of video time interval (
-                in seconds) for each grid along the temporal dimension in the
-                3D position IDs. `None` if no videos are passed.
         """
 
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 86b4a9a018c76d72da0aac2658e81d044f083d0c..54ec7b86274889313da02550056b60911bc7b2ee 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -23,7 +23,7 @@
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -47,6 +47,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
@@ -54,21 +55,38 @@ from .utils import (AutoWeightsLoader, init_vllm_registered_model,
 
 
 # # === Audio Inputs === #
-class Qwen2AudioFeatureInputs(TypedDict):
+class Qwen2AudioFeatureInputs(TensorSchema):
+    """
+    Dimensions:
+        - na: Number of audios
+        - nmb: Number of mel bins
+    """
     type: Literal["audio_features"]
-    input_features: torch.Tensor
-    """Shape: `(num_audios, num_mel_bins, 3000)`"""
+    input_features: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("na", "nmb", 3000),
+    ]
 
-    feature_attention_mask: torch.Tensor
-    """Shape: `(num_audios, 3000)`"""
+    feature_attention_mask: Annotated[
+        torch.Tensor,
+        TensorShape("na", 3000),
+    ]
 
 
-class Qwen2AudioEmbeddingInputs(TypedDict):
-    type: Literal["audio_embeds"]
-    audio_embeds: list[torch.Tensor]
-    """Shape: `(num_audio_features, hidden_size)`
-    `hidden_size` must match the hidden size of language model backbone.
+class Qwen2AudioEmbeddingInputs(TensorSchema):
     """
+    Dimensions:
+        - bn: Batch size
+        - naf: Number of audio features
+        - hs: Hidden size (must match the hidden size of language model
+          backbone)
+    """
+    type: Literal["audio_embeds"] = "audio_embeds"
+
+    audio_embeds: Annotated[
+        list[torch.Tensor],
+        TensorShape("bn", "naf", "hs"),
+    ]
 
 
 Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs]
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 421b43563bade76fa5993d7ee7669e463b9ef571..2bd9d2b52628a587245cfe9ea469d36d864d642a 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -53,15 +53,18 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
         self.quant_config = quant_config
         self.model = Qwen2Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
+        self.head_dtype = vllm_config.model_config.head_dtype
 
         self.score = nn.Sequential(
             ColumnParallelLinear(config.hidden_size,
                                  config.hidden_size,
                                  quant_config=quant_config,
+                                 params_dtype=self.head_dtype,
                                  return_bias=False),
             nn.ReLU(),
             RowParallelLinear(config.hidden_size,
                               config.num_labels,
+                              params_dtype=self.head_dtype,
                               quant_config=quant_config,
                               return_bias=False),
         )
@@ -80,6 +83,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
+        hidden_states = hidden_states.to(self.head_dtype)
         logits = self.score(hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index ae7a8d8d7a5b9efc695b038aa5db46de81a945d4..90a1ad2a658ab3bdceef8e185ee00abd6769effc 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -26,7 +26,7 @@
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from typing import Any, Callable, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Callable, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -70,6 +70,7 @@ from vllm.platforms import _Backend, current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
@@ -86,78 +87,119 @@ _MAX_FRAMES_PER_VIDEO = 16
 # === Vision Inputs === #
 
 
-class Qwen2VLImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    pixel_values: torch.Tensor
-    """Shape:
-    `(num_patches, num_channels * patch_size * patch_size)`
+class Qwen2VLImagePixelInputs(TensorSchema):
     """
-
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
+    Dimensions:
+        - np: The total number of patches over each image over each prompt in
+              the batch
+        - ni: Number of images
+        - cps: Number of channels * patch_size * patch_size
+    
+    Historical context:
+        - pixel_values shape: (num_patches, num_channels * patch_size * 
+          patch_size)
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["pixel_values"]
 
+    pixel_values: Annotated[
+        torch.Tensor,
+        TensorShape("np", "cps"),
+    ]
 
-class Qwen2VLImageEmbeddingInputs(TypedDict):
-    type: Literal["image_embeds"]
-    image_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all images' features.
-        Each tensor holds an image's features.
-    - `torch.Tensor`: A tensor holding all images' features
-        (concatenation of all images' feature tensors).
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
+
+
+class Qwen2VLImageEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of image features
+        - hs: Hidden size
+        - ni: Number of images
     
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on
-        the number and resolution of the images.
-    - `hidden_size` must match the hidden size of language model backbone.
+    Historical context:
+        - image_embeds shape: (num_image_features, hidden_size)
+        - num_image_features varies based on the number and resolution of the
+          images.
+        - hidden_size must match the hidden size of language model backbone.
+        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["image_embeds"]
 
-    image_grid_thw: torch.Tensor
-    """Shape: `(num_images, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    image_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    image_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("ni", 3),
+    ]
 
 
 Qwen2VLImageInputs = Union[Qwen2VLImagePixelInputs,
                            Qwen2VLImageEmbeddingInputs]
 
 
-class Qwen2VLVideoPixelInputs(TypedDict):
-    type: Literal["pixel_values_videos"]
-    pixel_values_videos: torch.Tensor
-    """Shape:
-    `(num_patches,
-      num_channels * temporal_patch_size * patch_size * patch_size)`
+class Qwen2VLVideoPixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - np: The total number of patches over each video over each prompt in
+              the batch
+        - ctps: Number of channels * temporal_patch_size * patch_size * 
+          patch_size
+        - nv: Number of videos
+    
+    Historical context:
+        - pixel_values_videos shape: (num_patches, num_channels * 
+          temporal_patch_size * patch_size * patch_size)
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["pixel_values_videos"]
 
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
+    pixel_values_videos: Annotated[
+        torch.Tensor,
+        TensorShape("np", "ctps"),
+    ]
 
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
 
-class Qwen2VLVideoEmbeddingInputs(TypedDict):
-    type: Literal["video_embeds"]
-    video_embeds: torch.Tensor
-    """Supported types:
-    - list[`torch.Tensor`]: A list of tensors holding all videos' features.
-        Each tensor holds an video's features.
-    - `torch.Tensor`: A tensor holding all videos' features
-        (concatenation of all videos' feature tensors).
+class Qwen2VLVideoEmbeddingInputs(TensorSchema):
+    """
+    Dimensions:
+        - nf: Number of video features
+        - hs: Hidden size
+        - nv: Number of videos
     
-    Tensor shape: `(num_image_features, hidden_size)`
-    - `num_image_features` varies based on 
-        the number and resolution of the videos.
-    - `hidden_size` must match the hidden size of language model backbone.
+    Historical context:
+        - video_embeds shape: (num_video_features, hidden_size)
+        - num_video_features varies based on the number and resolution of the
+          videos.
+        - hidden_size must match the hidden size of language model backbone.
+        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
+          format
     """
+    type: Literal["video_embeds"]
 
-    video_grid_thw: torch.Tensor
-    """Shape: `(num_videos, 3)`
-    This should be in `(grid_t, grid_h, grid_w)` format.
-    """
+    video_embeds: Annotated[
+        torch.Tensor,
+        TensorShape("nf", "hs"),
+    ]
+
+    video_grid_thw: Annotated[
+        torch.Tensor,
+        TensorShape("nv", 3),
+    ]
 
 
 Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs,
@@ -915,12 +957,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         seq_len: int,
         mm_counts: Mapping[str, int],
     ) -> int:
-        max_images = mm_counts.get("image", 0)
         max_videos = mm_counts.get("video", 0)
 
-        max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self._get_max_video_frames(seq_len -
-                                                      max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
 
@@ -1129,10 +1168,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
             return Qwen2VLImagePixelInputs(type="pixel_values",
                                            pixel_values=pixel_values,
                                            image_grid_thw=image_grid_thw)
@@ -1143,9 +1178,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_grid_thw = self._validate_and_reshape_mm_tensor(
                 image_grid_thw, "image grid_thw")
 
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
             return Qwen2VLImageEmbeddingInputs(type="image_embeds",
                                                image_embeds=image_embeds,
                                                image_grid_thw=image_grid_thw)
@@ -1177,9 +1209,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             video_grid_thw = self._validate_and_reshape_mm_tensor(
                 video_grid_thw, "video grid_thw")
 
-            if not isinstance(video_embeds, torch.Tensor):
-                raise ValueError("Incorrect type of video embeddings. "
-                                 f"Got type: {type(video_embeds)}")
             return Qwen2VLVideoEmbeddingInputs(type="video_embeds",
                                                video_embeds=video_embeds,
                                                video_grid_thw=video_grid_thw)
@@ -1189,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"]
@@ -1198,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
 
-        return image_embeds.split(sizes.tolist())
+        return image_embeds.split(sizes)
 
     def _process_video_input(
             self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
 
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"]
@@ -1216,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
-        sizes = grid_thw.prod(-1) // merge_size // merge_size
+        sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
+                 (merge_size * merge_size)).tolist()
 
-        return video_embeds.split(sizes.tolist())
+        return video_embeds.split(sizes)
 
     def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         modalities = {}
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c16c462885da8a67196ef214681ac98c1d4cfb
--- /dev/null
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -0,0 +1,1294 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Qwen3Next model."""
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+from transformers.activations import ACT2FN
+
+from vllm import envs
+from vllm.attention import Attention, AttentionBackend, AttentionMetadata
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import (CacheConfig, ModelConfig, SpeculativeConfig,
+                         VllmConfig, get_current_vllm_config)
+from vllm.distributed import (divide, get_ep_group, get_pp_group,
+                              get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fla.ops import (
+    RMSNormGated, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
+from vllm.model_executor.layers.fused_moe import FusedMoE
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.model_executor.layers.layernorm import (
+    GemmaRMSNorm as Qwen3NextRMSNorm)
+# yapf: enable
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.abstract import MambaBase
+from vllm.model_executor.layers.mamba.mamba_mixer2 import (
+    mamba_v2_sharded_weight_loader)
+from vllm.model_executor.layers.mamba.mamba_utils import (
+    MambaStateDtypeCalculator, MambaStateShapeCalculator)
+from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
+    causal_conv1d_fn, causal_conv1d_update)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, sharded_weight_loader)
+from vllm.model_executor.models.mamba_cache import MambaCacheParams
+from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import Qwen3NextConfig
+from vllm.triton_utils import tl, triton
+from vllm.utils import direct_register_custom_op
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
+
+from .interfaces import (HasInnerState, IsHybrid, MixtureOfExperts,
+                         SupportsLoRA, SupportsPP)
+from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index,
+                    is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
+
+logger = init_logger(__name__)
+
+KVCache = tuple[torch.Tensor, torch.Tensor]
+
+
+class Qwen3NextSparseMoeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.num_experts
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}.")
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_logical_experts = self.n_routed_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = (self.n_logical_experts +
+                                   self.n_redundant_experts)
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+
+        self.physical_expert_start = (self.ep_rank *
+                                      self.n_local_physical_experts)
+        self.physical_expert_end = (self.physical_expert_start +
+                                    self.n_local_physical_experts)
+
+        self.experts = FusedMoE(num_experts=self.n_routed_experts,
+                                top_k=config.num_experts_per_tok,
+                                hidden_size=config.hidden_size,
+                                intermediate_size=config.moe_intermediate_size,
+                                reduce_results=False,
+                                renormalize=config.norm_topk_prob,
+                                quant_config=quant_config,
+                                prefix=f"{prefix}.experts",
+                                enable_eplb=self.enable_eplb,
+                                num_redundant_experts=self.n_redundant_experts)
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=self._maybe_ignore_quant_config(quant_config),
+            prefix=f"{prefix}.gate")
+
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = Qwen3NextMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=self.experts.must_reduce_shared_expert_outputs(
+                ),
+            )
+        else:
+            self.shared_expert = None
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
+                                                  1,
+                                                  bias=False)
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid gate quantization.
+        # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        shared_output = None
+        if self.shared_expert is not None:
+            shared_output = self.shared_expert(hidden_states)
+            if self.shared_expert_gate is not None:
+                shared_output = F.sigmoid(
+                    self.shared_expert_gate(hidden_states)) * shared_output
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = self.experts(hidden_states=hidden_states,
+                                           router_logits=router_logits)
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
+                final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
+
+    @property
+    def mamba_type(self) -> str:
+        return "linear_attention"
+
+    def get_attn_backend(self) -> type["AttentionBackend"]:
+        from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend
+        return GDNAttentionBackend
+
+    def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]:
+        return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
+            self.model_config.dtype, self.cache_config.mamba_cache_dtype)
+
+    def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]:
+        return MambaStateShapeCalculator.gated_delta_net_state_shape(
+            self.tp_size,
+            self.num_k_heads,
+            self.num_v_heads,
+            self.head_k_dim,
+            self.head_v_dim,
+            self.conv_kernel_size,
+            self.num_spec,
+            use_v1=True)
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        model_config: Optional[ModelConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        speculative_config: Optional[SpeculativeConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.hidden_size = config.hidden_size
+        self.num_v_heads = config.linear_num_value_heads
+        self.num_k_heads = config.linear_num_key_heads
+        self.head_k_dim = config.linear_key_head_dim
+        self.head_v_dim = config.linear_value_head_dim
+        self.key_dim = self.head_k_dim * self.num_k_heads
+        self.value_dim = self.head_v_dim * self.num_v_heads
+
+        self.conv_kernel_size = config.linear_conv_kernel_dim
+        self.layer_idx = extract_layer_index(prefix)
+        self.activation = config.hidden_act
+        self.act = ACT2FN[config.hidden_act]
+        self.layer_norm_epsilon = config.rms_norm_eps
+        self.prefix = prefix
+
+        self.config = config
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.quant_config = quant_config
+        self.speculative_config = speculative_config
+        self.num_spec = (self.speculative_config.num_speculative_tokens
+                         if self.speculative_config else 0)
+
+        # QKV
+        self.conv_dim = self.key_dim * 2 + self.value_dim
+        self.conv1d = ColumnParallelLinear(
+            input_size=self.conv_kernel_size,
+            output_size=self.conv_dim,
+            bias=False,
+            prefix=f"{prefix}.conv1d",
+        )
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+        # projection of the input hidden states
+        self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
+        self.projection_size_ba = self.num_v_heads * 2
+        self.in_proj = MergedColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_sizes=[self.projection_size_qkvz, self.projection_size_ba],
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj",
+        )
+
+        query_key_settings = (self.key_dim, 0, False)
+        value_settings = (self.value_dim, 0, False)
+
+        delattr(self.conv1d.weight, "weight_loader")
+        set_weight_attrs(
+            self.conv1d.weight, {
+                "weight_loader":
+                mamba_v2_sharded_weight_loader([
+                    query_key_settings,
+                    query_key_settings,
+                    value_settings,
+                ], self.tp_size, self.tp_rank)
+            })
+
+        # selective projection used to make dt, B and C input dependant
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(
+            torch.ones(self.num_v_heads // self.tp_size), )
+        self.A_log = nn.Parameter(
+            torch.empty(
+                divide(self.num_v_heads, self.tp_size),
+                dtype=torch.float32,
+            ))
+
+        set_weight_attrs(self.A_log,
+                         {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias,
+                         {"weight_loader": sharded_weight_loader(0)})
+
+        self.norm = RMSNormGated(
+            self.head_v_dim,
+            eps=self.layer_norm_epsilon,
+            group_size=None,
+            norm_before_gate=True,
+            device=torch.cuda.current_device(),
+            dtype=config.torch_dtype,
+        )
+
+        self.out_proj = RowParallelLinear(self.value_dim,
+                                          self.hidden_size,
+                                          bias=False,
+                                          input_is_parallel=True,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.out_proj")
+
+        compilation_config = get_current_vllm_config().compilation_config
+        if prefix in compilation_config.static_forward_context:
+            raise ValueError(f"Duplicate layer name: {prefix}")
+        compilation_config.static_forward_context[prefix] = self
+
+    def fix_query_key_value_ordering(
+        self,
+        mixed_qkvz,
+        mixed_ba,
+    ):
+        """
+        Derives `query`, `key` and `value` tensors from `mixed_qkvzba`.
+        """
+        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
+            self.num_k_heads // self.tp_size,
+            (self.head_k_dim + self.head_k_dim +
+             (self.head_v_dim + self.head_v_dim) * self.num_v_heads //
+             self.num_k_heads),
+        )
+        new_tensor_shape_ba = mixed_qkvz.size()[:-1] + (
+            self.num_k_heads // self.tp_size,
+            2 * self.num_v_heads // self.num_k_heads,
+        )
+
+        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
+        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
+
+        split_arg_list_qkvz = [
+            self.head_k_dim,
+            self.head_k_dim,
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+        ]
+        split_arg_list_ba = [
+            self.num_v_heads // self.num_k_heads,
+            self.num_v_heads // self.num_k_heads
+        ]
+
+        # [b, sq, ng, (hn + hn + np/ng * hn + np/ng + np/ng)]
+        # --> [b, sq, ng, hn], [b, sq, ng, hn], [b, sq, ng, np/ng * hn],
+        #  [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng], [b, sq, ng, np/ng]
+        (query, key, value, z) = torch.split(mixed_qkvz,
+                                             split_arg_list_qkvz,
+                                             dim=2)
+        (b, a) = torch.split(mixed_ba, split_arg_list_ba, dim=2)
+
+        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
+        value = value.reshape(value.size(0), -1, self.head_v_dim)
+        z = z.reshape(z.size(0), -1, self.head_v_dim)
+        b = b.reshape(b.size(0), self.num_v_heads // self.tp_size)
+        a = a.reshape(a.size(0), self.num_v_heads // self.tp_size)
+
+        return query, key, value, z, b, a
+
+    def rearrange_mixed_qkv(self, mixed_qkv):
+        if mixed_qkv is None:
+            return None, None, None
+        query, key, value = torch.split(
+            mixed_qkv,
+            [
+                self.key_dim // self.tp_size,
+                self.key_dim // self.tp_size,
+                self.value_dim // self.tp_size,
+            ],
+            dim=-1,
+        )
+        query, key = map(
+            lambda x: rearrange(x, 'l (h d) -> 1 l h d', d=self.head_k_dim),
+            (query, key))
+        value = rearrange(value, 'l (h d) -> 1 l h d', d=self.head_v_dim)
+        return query, key, value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        cache_params: Optional[MambaCacheParams] = None,
+    ):
+        return torch.ops.vllm.gdn_attention(
+            hidden_states,
+            output,
+            self.prefix,
+        )
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+    ):
+        forward_context = get_forward_context()
+        attn_metadata: AttentionMetadata = forward_context.attn_metadata
+
+        if attn_metadata is None:
+            # V1 profile run
+            return
+
+        assert isinstance(attn_metadata, dict)
+        attn_metadata = attn_metadata[self.prefix]
+        assert isinstance(attn_metadata, GDNAttentionMetadata)
+        has_initial_state = attn_metadata.has_initial_state
+        spec_query_start_loc = attn_metadata.spec_query_start_loc
+        non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
+        spec_sequence_masks = attn_metadata.spec_sequence_masks
+        spec_token_masks = attn_metadata.spec_token_masks
+        spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
+        non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
+        self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+        conv_state = self_kv_cache[0].transpose(-1, -2)
+        ssm_state = self_kv_cache[1]
+        num_actual_tokens = (attn_metadata.num_prefill_tokens +
+                             attn_metadata.num_decode_tokens +
+                             attn_metadata.num_spec_decode_tokens)
+        num_accepted_tokens = attn_metadata.num_accepted_tokens
+
+        # 1. Set up dimensions for reshapes later
+        projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
+        if spec_token_masks is not None:
+            spec_token_masks = spec_token_masks[:num_actual_tokens]
+        projected_states_qkvz, projected_states_ba = torch.split(
+            projected_states,
+            [
+                self.projection_size_qkvz // self.tp_size,
+                self.projection_size_ba // self.tp_size
+            ],
+            dim=-1,
+        )
+        query, key, value, z, b, a = self.fix_query_key_value_ordering(
+            projected_states_qkvz, projected_states_ba)
+        query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'),
+                                (query, key, value))
+        mixed_qkv = torch.cat((query, key, value), dim=-1)
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
+                                               self.conv1d.weight.size(2))
+
+        if spec_sequence_masks is not None:
+            if (attn_metadata.num_prefills == 0
+                    and attn_metadata.num_decodes == 0):
+                mixed_qkv_spec = mixed_qkv
+                mixed_qkv_non_spec = None
+            else:
+                mixed_qkv_spec = mixed_qkv[spec_token_masks]
+                mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
+        else:
+            mixed_qkv_spec = None
+            mixed_qkv_non_spec = mixed_qkv
+
+        # 2.1: process the mutli-query part
+        if spec_sequence_masks is not None:
+            mixed_qkv_spec = mixed_qkv_spec.view(
+                attn_metadata.num_spec_decodes, -1, mixed_qkv_spec.size(-1))
+            mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b l d -> b d l')
+            mixed_qkv_spec = causal_conv1d_update(
+                mixed_qkv_spec,
+                conv_state,
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+                conv_state_indices=spec_state_indices_tensor[:, 0]
+                [:attn_metadata.num_spec_decodes],
+                num_accepted_tokens=num_accepted_tokens,
+                validate_data=False,
+            )
+            mixed_qkv_spec = rearrange(mixed_qkv_spec, 'b d l -> (b l) d')
+
+        # 2.2: process the remaining part
+        if attn_metadata.num_prefills > 0:
+            # - "cache_indices" updates the conv_state cache in positions
+            #   pointed to by "mamba_cache_params.state_indices_tensor"
+            mixed_qkv_non_spec = causal_conv1d_fn(
+                mixed_qkv_non_spec.transpose(0, 1),
+                conv_weights,
+                self.conv1d.bias,
+                activation=self.activation,
+                conv_states=conv_state,
+                has_initial_state=has_initial_state,
+                cache_indices=non_spec_state_indices_tensor,
+                query_start_loc=non_spec_query_start_loc,
+            ).transpose(0, 1)
+        elif attn_metadata.num_decodes > 0:
+            mixed_qkv_non_spec = causal_conv1d_update(
+                mixed_qkv_non_spec,
+                conv_state,
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+                conv_state_indices=non_spec_state_indices_tensor[:attn_metadata
+                                                                 .num_decodes],
+                validate_data=True,
+            )
+        else:
+            mixed_qkv_non_spec = None
+
+        query_spec, key_spec, value_spec = self.rearrange_mixed_qkv(
+            mixed_qkv_spec)
+        query_non_spec, key_non_spec, value_non_spec = self.rearrange_mixed_qkv(
+            mixed_qkv_non_spec)
+
+        beta = b.sigmoid()
+        # g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+        g = fused_gdn_gating(self.A_log, a, self.dt_bias)
+        g, beta = map(lambda x: rearrange(x, 'l d -> 1 l d'), (g, beta))
+
+        if spec_sequence_masks is not None:
+            if (attn_metadata.num_prefills == 0
+                    and attn_metadata.num_decodes == 0):
+                g_spec = g
+                beta_spec = beta
+                g_non_spec = None
+                beta_non_spec = None
+            else:
+                g_spec = g[:, spec_token_masks]
+                beta_spec = beta[:, spec_token_masks]
+                g_non_spec = g[:, ~spec_token_masks]
+                beta_non_spec = beta[:, ~spec_token_masks]
+        else:
+            g_spec = None
+            beta_spec = None
+            g_non_spec = g
+            beta_non_spec = beta
+
+        # 3. Recurrent attention
+
+        # 3.1: process the mutlti-query part
+        if spec_sequence_masks is not None:
+            core_attn_out_spec, last_recurrent_state = (
+                fused_recurrent_gated_delta_rule(
+                    q=query_spec,
+                    k=key_spec,
+                    v=value_spec,
+                    g=g_spec,
+                    beta=beta_spec,
+                    initial_state=ssm_state,
+                    inplace_final_state=True,
+                    cu_seqlens=spec_query_start_loc[:attn_metadata.
+                                                    num_spec_decodes + 1],
+                    ssm_state_indices=spec_state_indices_tensor,
+                    num_accepted_tokens=num_accepted_tokens,
+                    use_qk_l2norm_in_kernel=True,
+                ))
+        else:
+            core_attn_out_spec, last_recurrent_state = None, None
+
+        # 3.2: process the remaining part
+        if attn_metadata.num_prefills > 0:
+            initial_state = ssm_state[
+                non_spec_state_indices_tensor].contiguous()
+            initial_state[~has_initial_state, ...] = 0
+            (
+                core_attn_out_non_spec,
+                last_recurrent_state,
+            ) = chunk_gated_delta_rule(
+                q=query_non_spec,
+                k=key_non_spec,
+                v=value_non_spec,
+                g=g_non_spec,
+                beta=beta_non_spec,
+                initial_state=initial_state,
+                output_final_state=True,
+                cu_seqlens=non_spec_query_start_loc,
+                head_first=False,
+                use_qk_l2norm_in_kernel=True,
+            )
+            # Init cache
+            ssm_state[non_spec_state_indices_tensor] = last_recurrent_state.to(
+                ssm_state.dtype)
+        elif attn_metadata.num_decodes > 0:
+            core_attn_out_non_spec, last_recurrent_state = (
+                fused_recurrent_gated_delta_rule(
+                    q=query_non_spec,
+                    k=key_non_spec,
+                    v=value_non_spec,
+                    g=g_non_spec,
+                    beta=beta_non_spec,
+                    initial_state=ssm_state,
+                    inplace_final_state=True,
+                    cu_seqlens=non_spec_query_start_loc[:attn_metadata.
+                                                        num_decodes + 1],
+                    ssm_state_indices=non_spec_state_indices_tensor,
+                    use_qk_l2norm_in_kernel=True,
+                ))
+        else:
+            core_attn_out_non_spec, last_recurrent_state = None, None
+
+        # Merge core attention output
+        if (spec_sequence_masks is not None
+                and core_attn_out_non_spec is not None):
+            core_attn_out = torch.empty(
+                (1, num_actual_tokens, *core_attn_out_spec.shape[2:]),
+                dtype=core_attn_out_non_spec.dtype,
+                device=core_attn_out_non_spec.device,
+            )
+            core_attn_out[:, spec_token_masks] = core_attn_out_spec
+            core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
+        elif spec_sequence_masks is not None:
+            core_attn_out = core_attn_out_spec
+        else:
+            core_attn_out = core_attn_out_non_spec
+
+        z_shape_og = z.shape
+        # reshape input data into 2D tensor
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, '... h d -> ... (h d)')
+
+        output[:num_actual_tokens], _ = self.out_proj(core_attn_out)
+
+
+class Qwen3NextAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        model_config: Optional[ModelConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.head_dim or (self.hidden_size // self.num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None)
+        self.attn_output_gate = getattr(config, "attn_output_gate", True)
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads * (1 + self.attn_output_gate),
+            self.total_num_kv_heads,
+            bias=getattr(config, "qkv_bias", False),
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=config.max_position_embeddings,
+            base=config.rope_theta,
+            rope_scaling=config.rope_scaling,
+            partial_rotary_factor=config.partial_rotary_factor,
+            dual_chunk_attention_config=self.dual_chunk_attention_config,
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            **{
+                "layer_idx": extract_layer_index(prefix),
+                "dual_chunk_attention_config":
+                self.dual_chunk_attention_config,
+            } if self.dual_chunk_attention_config else {},
+        )
+
+        self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ):
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        if self.attn_output_gate:
+            q_gate, k, v = qkv.split(
+                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1)
+            orig_shape = q_gate.shape[:-1]
+            q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
+            q, gate = torch.chunk(q_gate, 2, dim=-1)
+            q = q.reshape(*orig_shape, -1)
+            gate = gate.reshape(*orig_shape, -1)
+        else:
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
+                                dim=-1)
+
+        q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view(
+            -1, self.num_heads * self.head_dim)
+        k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)).view(
+            -1, self.num_kv_heads * self.head_dim)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v)
+
+        if self.attn_output_gate:
+            gate = torch.sigmoid(gate)
+            attn_output = attn_output * gate
+
+        output[:], _ = self.o_proj(attn_output)
+
+
+class Qwen3NextDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_type: str,
+        model_config: Optional[ModelConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        speculative_config: Optional[SpeculativeConfig] = None,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.layer_type = layer_type
+        self.layer_idx = extract_layer_index(prefix)
+
+        if self.layer_type == "linear_attention":
+            self.linear_attn = Qwen3NextGatedDeltaNet(
+                config,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                speculative_config=speculative_config,
+                prefix=f'{prefix}.linear_attn')
+        elif self.layer_type == "full_attention":
+            self.self_attn = Qwen3NextAttention(
+                config,
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f'{prefix}.self_attn',
+            )
+        else:
+            raise ValueError(f"Invalid layer_type {self.layer_type}")
+
+        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
+                           config.mlp_only_layers)
+        if (self.layer_idx not in mlp_only_layers) and (
+                config.num_experts > 0 and
+            (self.layer_idx + 1) % config.decoder_sparse_step == 0):
+            self.mlp = Qwen3NextSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
+            )
+        else:
+            self.mlp = Qwen3NextMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+            )
+
+        self.input_layernorm = Qwen3NextRMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3NextRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps)
+
+        self.layer_scale = getattr(config, "layer_scale", False)
+        if self.layer_scale:
+            self.attn_layer_scale = torch.nn.Parameter(
+                torch.zeros(
+                    1,
+                    1,
+                    self.config.hidden_size,
+                    dtype=config.torch_dtype,
+                ), )
+            self.ffn_layer_scale = torch.nn.Parameter(
+                torch.zeros(
+                    1,
+                    1,
+                    self.config.hidden_size,
+                    dtype=config.torch_dtype,
+                ), )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        positions: torch.Tensor = None,
+        **kwargs: object,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+
+        self_attention_output = torch.empty_like(hidden_states)
+        if self.layer_type == "linear_attention":
+            self.linear_attn(
+                hidden_states=hidden_states,
+                output=self_attention_output,
+            )
+        elif self.layer_type == "full_attention":
+            self.self_attn(
+                hidden_states=hidden_states,
+                output=self_attention_output,
+                positions=positions,
+            )
+        else:
+            raise ValueError("Invalid layer_type")
+        hidden_states = self_attention_output
+
+        if self.layer_scale:
+            if len(hidden_states.shape) == 2:
+                hidden_states = hidden_states * (
+                    self.attn_layer_scale.to(hidden_states.dtype)[0] + 1)
+            else:
+                hidden_states = hidden_states * (
+                    self.attn_layer_scale.to(hidden_states.dtype) + 1)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        if self.layer_scale:
+            if len(hidden_states.shape) == 2:
+                hidden_states = hidden_states * (
+                    self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1)
+            else:
+                assert len(hidden_states.shape) == len(
+                    self.ffn_layer_scale.shape
+                ), f'shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}'  # noqa: E501
+                hidden_states = hidden_states * (
+                    self.ffn_layer_scale.to(hidden_states.dtype) + 1)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class Qwen3NextModel(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config: Qwen3NextConfig = vllm_config.model_config.hf_config
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        parallel_config = vllm_config.parallel_config
+        lora_config = vllm_config.lora_config
+        speculative_config = vllm_config.speculative_config
+        enable_eplb = parallel_config.enable_eplb
+        eplb_config = parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts
+
+        self.config = config
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        def get_layer(prefix: str):
+            return Qwen3NextDecoderLayer(
+                config,
+                layer_type=config.layer_types[extract_layer_index(prefix)],
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                speculative_config=speculative_config,
+                prefix=prefix,
+                enable_eplb=enable_eplb,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+
+        self.norm = Qwen3NextRMSNorm(config.hidden_size,
+                                     eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        return FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+            num_redundant_experts=self.num_redundant_experts)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            ("in_proj", "in_proj_qkvz", 0),
+            ("in_proj", "in_proj_ba", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if name.startswith("mtp."):
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                # name = apply_attn_prefix(name, params_dict)
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
+                           MixtureOfExperts, IsHybrid):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        lora_config = vllm_config.lora_config
+        scheduler_config = vllm_config.scheduler_config
+        assert not cache_config.enable_prefix_caching, \
+            "Qwen3Next currently does not support prefix caching"
+        assert envs.VLLM_USE_V1, "Qwen3Next requires VLLM_USE_V1"
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.scheduler_config = scheduler_config
+        self.model = Qwen3NextModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE
+            # We need bigger padding if using lora for kernel
+            # compatibility
+            if not lora_config else lora_config.lora_vocab_padding_size,
+        )
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+        # Set MoE hyperparameters
+        self.expert_weights = []
+
+        self.moe_layers: list[FusedMoE] = []
+        example_layer = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, Qwen3NextDecoderLayer)
+            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+                example_layer = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_layer is None:
+            raise RuntimeError("No Qwen3Next layer found in the model.layers.")
+
+        self.num_moe_layers = len(self.moe_layers)
+        self.num_expert_groups = 1
+        self.num_shared_experts = 0
+        self.num_logical_experts = example_layer.n_logical_experts
+        self.num_physical_experts = example_layer.n_physical_experts
+        self.num_local_physical_experts = example_layer.n_local_physical_experts
+        self.num_routed_experts = example_layer.n_routed_experts
+        self.num_redundant_experts = example_layer.n_redundant_experts
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = (num_physical_experts -
+                                      self.num_logical_experts)
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, Qwen3NextSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+
+        return hidden_states
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(
+        cls,
+        vllm_config: "VllmConfig",
+    ) -> tuple[torch.dtype, torch.dtype]:
+        return MambaStateDtypeCalculator.gated_delta_net_state_dtype(
+            vllm_config.model_config.dtype,
+            vllm_config.cache_config.mamba_cache_dtype)
+
+    @classmethod
+    def get_mamba_state_shape_from_config(
+            cls, vllm_config: "VllmConfig"
+    ) -> tuple[tuple[int, int], tuple[int, int]]:
+        parallel_config = vllm_config.parallel_config
+        hf_config = vllm_config.model_config.hf_config
+        tp_size = parallel_config.tensor_parallel_size
+        num_spec = (vllm_config.speculative_config.num_speculative_tokens
+                    if vllm_config.speculative_config else 0)
+        return MambaStateShapeCalculator.gated_delta_net_state_shape(
+            tp_size,
+            hf_config.linear_num_key_heads,
+            hf_config.linear_num_value_heads,
+            hf_config.linear_key_head_dim,
+            hf_config.linear_value_head_dim,
+            hf_config.linear_conv_kernel_dim,
+            num_spec,
+            use_v1=True)
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        return self.logits_processor(self.lm_head, hidden_states,
+                                     sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=["mtp."],
+        )
+        return loader.load_weights(weights)
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        return self.model.get_expert_mapping()
+
+
+def gdn_attention(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self._forward(hidden_states=hidden_states, output=output)
+
+
+def gdn_attention_fake(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="gdn_attention",
+    op_func=gdn_attention,
+    mutates_args=["output"],
+    fake_impl=gdn_attention_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
+
+
+# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+@triton.jit
+def fused_gdn_gating_kernel(
+    g,
+    A_log,
+    a,
+    dt_bias,
+    seq_len,
+    NUM_HEADS: tl.constexpr,
+    beta: tl.constexpr,
+    threshold: tl.constexpr,
+    BLK_HEADS: tl.constexpr,
+):
+    i_b, i_s, i_d = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    head_off = i_d * BLK_HEADS + tl.arange(0, BLK_HEADS)
+    off = i_b * seq_len * NUM_HEADS + i_s * NUM_HEADS + head_off
+    mask = head_off < NUM_HEADS
+    blk_A_log = tl.load(A_log + head_off, mask=mask)
+    blk_a = tl.load(a + off, mask=mask)
+    blk_bias = tl.load(dt_bias + head_off, mask=mask)
+    # If the model is loaded in fp16, without the .float() here, A might be -inf
+    x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
+    softplus_x = tl.where(beta * x <= threshold,
+                          (1 / beta) * tl.log(1 + tl.exp(beta * x)), x)
+    blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
+    tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+
+
+def fused_gdn_gating(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    dt_bias: torch.Tensor,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> torch.Tensor:
+    batch, num_heads = a.shape
+    seq_len = 1
+    grid = (batch, seq_len, triton.cdiv(num_heads, 8))
+    g = torch.empty_like(a, dtype=torch.float32)
+    fused_gdn_gating_kernel[grid](g,
+                                  A_log,
+                                  a,
+                                  dt_bias,
+                                  seq_len,
+                                  num_heads,
+                                  beta,
+                                  threshold,
+                                  8,
+                                  num_warps=1)
+    return g
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7aff377e9aeced9aec34da6c7b1baeef409a519
--- /dev/null
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -0,0 +1,285 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Inference-only Qwen3Next MTP model."""
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.qwen3_next import (Qwen3NextDecoderLayer,
+                                                   Qwen3NextRMSNorm)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import Qwen3NextConfig
+
+from .interfaces import SupportsPP
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, maybe_prefix)
+
+logger = init_logger(__name__)
+
+KVCache = tuple[torch.Tensor, torch.Tensor]
+
+
+@support_torch_compile
+class Qwen3NextMultiTokenPredictor(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+        config: Qwen3NextConfig = model_config.hf_config
+
+        self.config = config
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1)
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+
+        self.fc = ColumnParallelLinear(self.config.hidden_size * 2,
+                                       self.config.hidden_size,
+                                       gather_output=True,
+                                       bias=False,
+                                       return_bias=False)
+
+        self.layers = torch.nn.ModuleList(
+            Qwen3NextDecoderLayer(
+                config,
+                layer_type="full_attention",
+                model_config=model_config,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f'{prefix}.layers.{self.mtp_start_layer_idx + idx}',
+            ) for idx in range(self.num_mtp_layers))
+
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+
+        self.norm = Qwen3NextRMSNorm(config.hidden_size,
+                                     eps=config.rms_norm_eps)
+        self.pre_fc_norm_hidden = Qwen3NextRMSNorm(config.hidden_size,
+                                                   eps=config.rms_norm_eps)
+        self.pre_fc_norm_embedding = Qwen3NextRMSNorm(config.hidden_size,
+                                                      eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is None:
+                inputs_embeds = self.get_input_embeddings(input_ids)
+            assert hidden_states.shape[-1] == inputs_embeds.shape[-1]
+            inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+            hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1)
+            hidden_states = self.fc(hidden_states)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        hidden_states, residual = self.layers[current_step_idx](
+            positions=positions,
+            hidden_states=hidden_states,
+            residual=residual,
+        )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts)
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+@support_torch_compile
+class Qwen3NextMTP(nn.Module, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": ["up_proj", "down_proj"]
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        cache_config = vllm_config.cache_config
+        assert not cache_config.enable_prefix_caching, \
+            "Qwen3NextMTP currently does not support prefix caching"
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config,
+                                                  prefix=maybe_prefix(
+                                                      prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        self.lm_head = ParallelLMHead(self.unpadded_vocab_size,
+                                      config.hidden_size,
+                                      org_num_embeddings=config.vocab_size,
+                                      padding_size=DEFAULT_VOCAB_PADDING_SIZE)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+        hidden_states = self.model(input_ids, positions, hidden_states,
+                                   intermediate_tensors, inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        spec_step_idx: int = 0,
+    ) -> Optional[torch.Tensor]:
+        return self.logits_processor(self.lm_head, hidden_states,
+                                     sampling_metadata)
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        shared_weight_names = ["embed_tokens", "lm_head"]
+
+        def remap_weight_names(weights):
+            for name, weight in weights:
+                if name.startswith("mtp."):
+                    name = name.replace("mtp.", "model.")
+                elif not any(key in name for key in shared_weight_names):
+                    continue
+                yield name, weight
+
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(remap_weight_names(weights))
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 98115f8623563e46b5dd973ba7a3876b3a39a256..7d7654e846e1c2ff5b834633799ac294269d3b6f 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -74,6 +74,7 @@ _TEXT_GENERATION_MODELS = {
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"),
     "Gemma3nForCausalLM": ("gemma3n", "Gemma3nForCausalLM"),
+    "Qwen3NextForCausalLM": ("qwen3_next", "Qwen3NextForCausalLM"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),
     "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"),
     "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"),
@@ -110,7 +111,7 @@ _TEXT_GENERATION_MODELS = {
     "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
     "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
     "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"),
-    "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"),
+    "MotifForCausalLM": ("motif", "MotifForCausalLM"),
     # transformers's mpt class has lower case
     "MptForCausalLM": ("mpt", "MPTForCausalLM"),
     "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
@@ -155,6 +156,7 @@ _EMBEDDING_MODELS = {
     "BertModel": ("bert", "BertEmbeddingModel"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
+    "Gemma3TextModel": ("gemma3", "Gemma3Model"),
     "GlmForCausalLM": ("glm", "GlmForCausalLM"),
     "GPT2ForSequenceClassification": ("gpt2", "GPT2ForSequenceClassification"),
     "GritLM": ("gritlm", "GritLM"),
@@ -184,10 +186,11 @@ _EMBEDDING_MODELS = {
     "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
-    # Technically PrithviGeoSpatialMAE is a model that works on images, both in
+    # Technically Terratorch models work on images, both in
     # input and output. I am adding it here because it piggy-backs on embedding
     # models for the time being.
-    "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
+    "PrithviGeoSpatialMAE": ("terratorch", "Terratorch"),
+    "Terratorch": ("terratorch", "Terratorch"),
 }
 
 _CROSS_ENCODER_MODELS = {
@@ -222,11 +225,13 @@ _MULTIMODAL_MODELS = {
     "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"),  # noqa: E501
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
+    "NemotronH_Nano_VL": ("nano_nemotron_vl", "NemotronH_Nano_VL"),
     "InternS1ForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
     "InternVLForConditionalGeneration": ("interns1", "InternS1ForConditionalGeneration"),  # noqa: E501
     "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
     "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"),  # noqa: E501
     "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
+    "KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501
     "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
     "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
@@ -235,6 +240,7 @@ _MULTIMODAL_MODELS = {
     "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),  # noqa: E501
     "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"),  # noqa: E501
     "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
+    "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
     "MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"),  # noqa: E501
     "MiniCPMO": ("minicpmo", "MiniCPMO"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
@@ -274,13 +280,13 @@ _SPECULATIVE_DECODING_MODELS = {
     "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"),
     "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"),
     "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
-    # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
-    # "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
+    "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"),
     "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
     "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
     "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
     "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
     "MedusaModel": ("medusa", "Medusa"),
+    "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
     # Temporarily disabled.
     # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
     # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
@@ -637,6 +643,9 @@ class _ModelRegistry:
                 model_info = self._try_inspect_model_cls(arch)
                 if model_info is not None:
                     return (model_info, arch)
+        elif model_config.model_impl == ModelImpl.TERRATORCH:
+            model_info = self._try_inspect_model_cls("Terratorch")
+            return (model_info, "Terratorch")
 
         # Fallback to transformers impl (after resolving convert_type)
         if (all(arch not in self.models for arch in architectures)
@@ -685,6 +694,11 @@ class _ModelRegistry:
                 model_cls = self._try_load_model_cls(arch)
                 if model_cls is not None:
                     return (model_cls, arch)
+        elif model_config.model_impl == ModelImpl.TERRATORCH:
+            arch = "Terratorch"
+            model_cls = self._try_load_model_cls(arch)
+            if model_cls is not None:
+                return (model_cls, arch)
 
         # Fallback to transformers impl (after resolving convert_type)
         if (all(arch not in self.models for arch in architectures)
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 2bfa51162910b56e515c952222052162f60b539a..ba405be41687662da5c56e225e37d7267d1155a5 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -8,7 +8,7 @@ import torch
 from torch import nn
 from transformers import RobertaConfig
 
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool,
                                                DispatchPooler, Pooler)
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -73,10 +73,16 @@ class RobertaEmbedding(nn.Module):
 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
 
-    def __init__(self, config: RobertaConfig):
+    def __init__(self, model_config: "ModelConfig"):
         super().__init__()
-        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+        config = model_config.hf_config
+        head_dtype = model_config.head_dtype
+        self.dense = nn.Linear(config.hidden_size,
+                               config.hidden_size,
+                               dtype=head_dtype)
+        self.out_proj = nn.Linear(config.hidden_size,
+                                  config.num_labels,
+                                  dtype=head_dtype)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # CLSPool has already been applied in `pooling`
@@ -184,7 +190,7 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
         self.roberta = BertModel(vllm_config=vllm_config,
                                  prefix=maybe_prefix(prefix, "bert"),
                                  embedding_class=RobertaEmbedding)
-        self.classifier = RobertaClassificationHead(config)
+        self.classifier = RobertaClassificationHead(vllm_config.model_config)
 
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index f379d2c15fb6c4bcbb9478543e5c1ae148d60e0c..2ba5f94ea3b88dbb6627b20210d121126d5b273f 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -16,12 +16,12 @@ from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 from transformers import BatchFeature, PretrainedConfig, TensorType
 
+from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
-                                               ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -667,39 +667,25 @@ class Step3VisionAttention(nn.Module):
 
         self.q_size = self.num_heads * self.head_dim
 
-        if use_data_parallel:
-            self.qkv_proj = ReplicatedLinear(
-                self.embed_dim,
-                3 * self.q_size,
-                bias=True,
-                quant_config=quant_config,
-                prefix=prefix,
-            )
-            self.out_proj = ReplicatedLinear(
-                self.total_num_heads * self.head_dim,
-                self.embed_dim,
-                bias=True,
-                quant_config=quant_config,
-                prefix=prefix,
-            )
-        else:
-            self.qkv_proj = QKVParallelLinear(
-                self.embed_dim,
-                self.head_dim,
-                self.total_num_heads,
-                bias=True,
-                quant_config=quant_config,
-                prefix=prefix,
-            )
-            self.out_proj = RowParallelLinear(self.embed_dim,
-                                              self.embed_dim,
-                                              bias=True,
-                                              quant_config=quant_config,
-                                              prefix=prefix)
+        self.qkv_proj = QKVParallelLinear(
+            self.embed_dim,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            disable_tp=use_data_parallel,
+        )
+        self.out_proj = RowParallelLinear(self.embed_dim,
+                                          self.embed_dim,
+                                          bias=True,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.out_proj",
+                                          disable_tp=use_data_parallel)
 
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads,
-                           self.head_dim).transpose(1, 2).contiguous()
+        # Use unified MultiHeadAttention with automatic backend selection
+        self.attn = MultiHeadAttention(self.num_heads, self.head_dim,
+                                       self.scale)
 
     def forward(
         self,
@@ -711,19 +697,9 @@ class Step3VisionAttention(nn.Module):
         # get query proj
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
-        q = q.view(bsz, tgt_len, self.num_heads, self.head_dim)
-        k = k.view(bsz, tgt_len, self.num_heads, self.head_dim)
-        v = v.view(bsz, tgt_len, self.num_heads, self.head_dim)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        attn_output = F.scaled_dot_product_attention(q,
-                                                     k,
-                                                     v,
-                                                     scale=self.scale,
-                                                     is_causal=False)
-        attn_output = attn_output.transpose(1, 2).reshape(
-            bsz, tgt_len, self.num_heads * self.head_dim)
+
+        # Use unified MultiHeadAttention with automatic backend selection
+        attn_output = self.attn(q, k, v)
 
         attn_output, _ = self.out_proj(attn_output)
 
@@ -740,20 +716,18 @@ class Step3VisionMLP(nn.Module):
         super().__init__()
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        cls_fc1 = (ReplicatedLinear
-                   if use_data_parallel else ColumnParallelLinear)
-        self.fc1 = cls_fc1(config.hidden_size,
-                           config.intermediate_size,
-                           bias=True,
-                           quant_config=quant_config,
-                           prefix=prefix)
-        cls_fc2 = (ReplicatedLinear
-                   if use_data_parallel else RowParallelLinear)
-        self.fc2 = cls_fc2(config.intermediate_size,
-                           config.hidden_size,
-                           bias=True,
-                           quant_config=quant_config,
-                           prefix=prefix)
+        self.fc1 = ColumnParallelLinear(config.hidden_size,
+                                        config.intermediate_size,
+                                        bias=True,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.fc1",
+                                        disable_tp=use_data_parallel)
+        self.fc2 = RowParallelLinear(config.intermediate_size,
+                                     config.hidden_size,
+                                     bias=True,
+                                     quant_config=quant_config,
+                                     prefix=f"{prefix}.fc2",
+                                     disable_tp=use_data_parallel)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9dfa8e9b6f51ff52300ff75c5e8949a6abc43c3
--- /dev/null
+++ b/vllm/model_executor/models/terratorch.py
@@ -0,0 +1,294 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2025 The vLLM team.
+# Copyright 2025 IBM.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper around `Terratorch` models"""
+
+from collections import OrderedDict
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+from terratorch.vllm import (DummyDataGenerator, InferenceRunner,
+                             InputDefinition, InputTypeEnum)
+from transformers import BatchFeature
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import AutoWeightsLoader
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import MultiModalProcessorOnlyCache
+from vllm.multimodal.inputs import (ImageItem, ModalityData,
+                                    MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputs, MultiModalKwargsItems,
+                                    MultiModalUUIDDict, PlaceholderRange)
+from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
+                                   MultiModalDataItems, MultiModalDataParser)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptUpdate)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (IsAttentionFree, MultiModalEmbeddings,
+                         SupportsMultiModal)
+from .interfaces_base import default_pooling_type
+
+
+def _terratorch_field_names(pretrained_cfg: dict):
+    input_definition = InputDefinition(**pretrained_cfg["input"])
+    return set(input_definition.data.keys())
+
+
+def _terratorch_field_factory(
+    pretrained_cfg: dict
+) -> Callable[
+    [Mapping[str, torch.Tensor]],
+        Mapping[str, MultiModalFieldConfig],
+]:
+
+    def _terratorch_field_config(hf_inputs: Mapping[str, torch.Tensor]):
+        input_definition = InputDefinition(**pretrained_cfg["input"])
+        fields = {}
+        for input_name, input in input_definition.data.items():
+            if input.type == InputTypeEnum.tensor:
+                fields[input_name] = "image"
+
+        mm_fields_config = {}
+        for field_name, field_modality in fields.items():
+            mm_fields_config[field_name] = MultiModalFieldConfig.shared(
+                batch_size=1, modality=field_modality)
+        return mm_fields_config
+
+    return _terratorch_field_config
+
+
+class TerratorchProcessingInfo(BaseProcessingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+
+class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
+
+    def __init__(self, info: TerratorchProcessingInfo):
+        super().__init__(info)
+        self.dummy_data_generator = DummyDataGenerator(
+            self.info.get_hf_config().to_dict()["pretrained_cfg"])
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        # Dummy data is generated based on the 'input' section
+        # defined in the HF configuration file
+        return self.dummy_data_generator.get_dummy_mm_data()
+
+
+class TerratorchMultiModalDataParser(MultiModalDataParser):
+
+    def __init__(self, pretrained_cfg: dict, *args, **kwargs):
+        self._pretrained_cfg = pretrained_cfg
+        super().__init__(*args, **kwargs)
+
+    def _parse_image_data(
+        self,
+        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+    ) -> Optional[ModalityDataItems[Any, Any]]:
+        if isinstance(data, dict):
+
+            terratorch_fields = _terratorch_field_names(self._pretrained_cfg)
+
+            return DictEmbeddingItems(
+                data,
+                modality="image",
+                required_fields=terratorch_fields,
+                fields_factory=_terratorch_field_factory(self._pretrained_cfg),
+            )
+
+        return super()._parse_image_data(data)
+
+
+class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
+
+    def __init__(
+            self,
+            info: TerratorchProcessingInfo,
+            dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
+            *,
+            cache: Optional[MultiModalProcessorOnlyCache] = None) -> None:
+
+        self.pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"]
+        super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache)
+
+    def _get_data_parser(self) -> MultiModalDataParser:
+        return TerratorchMultiModalDataParser(
+            pretrained_cfg=self.pretrained_cfg)
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return _terratorch_field_factory(self.pretrained_cfg)(hf_inputs)
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:
+        return []
+
+    def apply(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
+    ) -> MultiModalInputs:
+        if "image" in mm_data:
+            image_data = mm_data["image"]
+        else:
+            image_data = mm_data
+            mm_data = {"image": mm_data}
+
+        mm_items = self._to_mm_items(mm_data)
+        tokenization_kwargs = tokenization_kwargs or {}
+        mm_hashes = self._hash_mm_items(mm_items,
+                                        hf_processor_mm_kwargs,
+                                        tokenization_kwargs,
+                                        mm_uuids=mm_uuids)
+        mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
+
+        mm_processed_data = BatchFeature(image_data)
+
+        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
+            mm_processed_data,
+            self._get_mm_fields_config(mm_processed_data,
+                                       hf_processor_mm_kwargs),
+        )
+
+        return MultiModalInputs(
+            type="multimodal",
+            prompt=prompt,
+            prompt_token_ids=[1],
+            mm_kwargs=mm_kwargs,
+            mm_hashes=mm_hashes,
+            mm_placeholders=mm_placeholders,
+        )
+
+
+@default_pooling_type("All")
+@MULTIMODAL_REGISTRY.register_processor(
+    TerratorchMultiModalProcessor,
+    info=TerratorchProcessingInfo,
+    dummy_inputs=TerratorchInputBuilder,
+)
+class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
+    supports_multimodal_raw_input_only = True
+    is_pooling_model = True
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        if modality.startswith("image"):
+            return None
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"]
+
+        self.inference_runner = InferenceRunner(config)
+        self.model = self.inference_runner.model
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler(
+            {"encode": Pooler.for_encode(pooler_config)}, )
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+        # We do not really use any input tokens and therefore no embeddings
+        # to be calculated. However, due to the mandatory token ids in
+        # the input prompt we pass one token and the size of the dummy
+        # embedding tensors must reflect that.
+        return torch.empty((input_ids.shape[0], 0))
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+        model_output = self.inference_runner.forward(**kwargs)
+
+        return model_output.output
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_list = []
+        model_buffers = dict(self.named_buffers())
+        loaded_buffers = []
+        for key, value in weights:
+            if isinstance(value, (dict, OrderedDict)):
+                if key == "state_dict":
+                    weights_to_parse = value
+                    for name, weight in weights_to_parse.items():
+                        name = f"inference_runner.{name}"
+
+                        if "pos_embed" in name:
+                            continue
+
+                        if "_timm_module." in name:
+                            name = name.replace("_timm_module.", "")
+
+                        # this model requires a couple of buffers to be loaded
+                        # that are not loadable with the AutoWeightsLoader
+                        if name in model_buffers:
+                            if "_timm_module." in name:
+                                name = name.replace("_timm_module.", "")
+                            buffer = model_buffers[name]
+                            weight_loader = getattr(buffer, "weight_loader",
+                                                    default_weight_loader)
+                            weight_loader(buffer, weight)
+                            loaded_buffers.append(name)
+                        else:
+                            params_list.append((name, weight))
+                    break
+
+            elif isinstance(value, torch.Tensor):
+                params_list.append((f"inference_runner.model.{key}", value))
+
+        # Load the remaining model parameters
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(params_list)
+
+        return autoloaded_weights.union(set(loaded_buffers))
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 5ad0482330ecdddc78ee35919f855e3080f297b1..a386f47e1929f9c676469c8fd7cf18b48a72803b 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -44,7 +44,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalInputs, PlaceholderRange)
+                                    MultiModalInputs, MultiModalUUIDDict,
+                                    PlaceholderRange)
 from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo)
@@ -347,7 +348,7 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -415,9 +416,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
                                        num_image_patches),
         )
         # Use overrides if provided; fallback to data-dependent hashing.
-        mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else
-                     self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
-                                         tokenization_kwargs))
+        mm_hashes = (mm_uuids if mm_uuids is not None else self._hash_mm_items(
+            mm_items, hf_processor_mm_kwargs, tokenization_kwargs))
 
         return MultiModalInputs(
             type="multimodal",
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index f91c4ddb6e8342fac2ba80c7a1cb938e9e8a1a01..9e28b0c443df473c6a60c2dbe9b40256e9eed003 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -4,7 +4,7 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Literal, Optional, Union
 
 import torch
 from torch import nn
@@ -31,6 +31,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
@@ -43,26 +44,37 @@ _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
 _MAX_ENCODER_BATCH_SIZE = 16
 
 
-class UltravoxAudioFeatureInputs(TypedDict):
-    type: Literal["audio_features"]
-    data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
-    """Shape: `(batch_size, num_chunks, 80, M)`"""
-    lens: Union[torch.Tensor, list[torch.Tensor]]
+class UltravoxAudioFeatureInputs(TensorSchema):
     """
-    Length of the audio frames. Used for attention mask in WhisperEncoder.
-    Shape: `(batch_size, num_chunks)`
+    Dimensions:
+    - b: batch size
+    - n: number of chunks
+    - t: Time frames (M)
+    - nmb: Number of mel bins
     """
-    token_len: Union[torch.Tensor, list[torch.Tensor]]
+    type: Literal["audio_features"]
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor],
+                          list[list[torch.Tensor]]],
+                    TensorShape("b", "n", "nmb", "t", dynamic_dims={"n"})]
+    lens: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("b", "n", dynamic_dims={"n"})]
+    """Length of the audio frames. Used for attention mask in WhisperEncoder."""
+    token_len: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                         TensorShape("b", "n", dynamic_dims={"n"})]
+    """Length of the audio tokens. Used for flattening the audio features."""
+
+
+class UltravoxAudioEmbeddingInputs(TensorSchema):
     """
-    Length of the audio tokens. Used for flattening the audio features.
-    Shape: `(batch_size, num_chunks)`
+    Dimensions:
+    - b: batch size
+    - na: number of audios
+    - afs: audio feature size
+    - hs: hidden size
     """
-
-
-class UltravoxAudioEmbeddingInputs(TypedDict):
     type: Literal["audio_embeds"]
-    data: NestedTensors
-    """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`"""
+    data: Annotated[Union[torch.Tensor, list[torch.Tensor]],
+                    TensorShape("b", "na", "afs", "hs")]
 
 
 UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs,
@@ -264,7 +276,7 @@ class UltravoxProjector(nn.Module):
         else:
             self.act = get_act_fn(config.projector_act)
 
-        dim_out = config.text_hidden_size
+        dim_out = config.text_config.hidden_size
         self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
 
         # Ultravox v0.4.1 and below use layer_norm after the second linear layer
@@ -406,7 +418,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config = vllm_config.model_config.hf_config
+        config: UltravoxConfig = vllm_config.model_config.hf_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multi_modal_config = multimodal_config
@@ -426,7 +438,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         self.multi_modal_projector = UltravoxProjector(config)
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
-            hf_config=config.text_config,
+            hf_config=config.wrapped_model_config,
             prefix=maybe_prefix(prefix, "language_model"),
         )
         if config.text_model_id is not None:
@@ -484,26 +496,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
             return None
 
         if audio_features is not None:
-            if not isinstance(audio_features, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio features. "
-                                 f"Got type: {type(audio_features)}")
-            if not isinstance(audio_lens, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio_lens. "
-                                 f"Got type: {type(audio_features)}")
-            if not isinstance(audio_token_len, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio_token_len. "
-                                 f"Got type: {type(audio_features)}")
-
             return UltravoxAudioFeatureInputs(type="audio_features",
                                               data=audio_features,
                                               lens=audio_lens,
                                               token_len=audio_token_len)
 
         if audio_embeds is not None:
-            if not isinstance(audio_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of audio embeds. "
-                                 f"Got type: {type(audio_embeds)}")
-
             return UltravoxAudioEmbeddingInputs(type="audio_embeds",
                                                 data=audio_embeds)
 
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 28cfefac30ddb9a010e2814bd26bbecdff08961b..e716ec582baaba4bf060242102638d47a79c3aa8 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -761,3 +761,10 @@ def fast_topk(values: torch.Tensor, topk: int,
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
+
+
+def get_model_hidden_size(hf_config: PretrainedConfig) -> int:
+    if hasattr(hf_config, "hidden_size"):
+        return hf_config.hidden_size
+    text_config = hf_config.get_text_config()
+    return text_config.hidden_size
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index de30509b1ccb4593c7111a133cd87e9473940839..c16aa5ac608f9eab18a9ab39f019044daeaca338 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -122,4 +122,4 @@ def resolve_visual_encoder_outputs(
     uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1)
     if post_layer_norm is not None and uses_last_layer:
         hs_pool[-1] = post_layer_norm(encoder_outputs)
-    return torch.cat(hs_pool, dim=-1)
+    return torch.cat(hs_pool, dim=-1)
\ No newline at end of file
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 6bc748407a7d1f8123cf786164998942e4b85e19..1ea317c2f95f9fa41dee01be4e4cf4dbb72daf0e 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -5,7 +5,7 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from math import ceil
-from typing import Optional, Union, cast
+from typing import Literal, Optional, Union, cast
 
 import numpy as np
 import regex as re
@@ -23,15 +23,18 @@ from transformers.tokenization_utils_base import TextInput
 from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models import SupportsPP
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 # yapf: disable
 from vllm.model_executor.models.whisper import WhisperEncoder
 # yapf: enable
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems, NestedTensors)
+                                    MultiModalKwargsItems, MultiModalUUIDDict,
+                                    NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
@@ -43,8 +46,8 @@ from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import (MistralTokenizer,
                                                cached_tokenizer_from_config)
 
-from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription)
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsTranscription)
 from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
                     merge_multimodal_embeddings)
 
@@ -290,14 +293,14 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_hash_overrides: Optional[dict[str, list[str]]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         # NOTE: The tokens are already inserted by the chat template
@@ -312,13 +315,25 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
                                         info=VoxtralProcessingInfo,
                                         dummy_inputs=VoxtralDummyInputsBuilder)
 class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsPP, SupportsTranscription):
+                                      SupportsPP, SupportsLoRA,
+                                      SupportsTranscription):
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"]
+    }
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
 
+        # update quant config to so that ignored module and target module names
+        # match the vLLM model names
+        if hasattr(vllm_config, "quant_config"):
+            vllm_config.quant_config = self.maybe_update_quant_config(
+                vllm_config.quant_config)
+
         config = vllm_config.model_config.hf_config
         self.config = config
         self.downsample_factor = self.config.audio_config.downsample_factor
@@ -340,6 +355,14 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """Get module prefix for multimodal models to filter LoRA modules."""
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="audio_language_adapter",
+            tower_model=["whisper_encoder"],
+        )
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -455,8 +478,10 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
     def get_generation_prompt(cls, audio: np.ndarray,
                               model_config: ModelConfig,
                               stt_config: SpeechToTextConfig,
-                              language: Optional[str], task_type: str,
-                              request_prompt: str) -> PromptType:
+                              language: Optional[str],
+                              task_type: Literal["transcribe", "translate"],
+                              request_prompt: str,
+                              to_language: Optional[str]) -> PromptType:
         tokenizer = cached_tokenizer_from_config(model_config)
         audio = Audio(audio, int(stt_config.sample_rate),
                       format="wav")  # lossless
@@ -540,6 +565,72 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         return loaded_weights
 
+    def maybe_update_quant_config(
+            self, quant_config: QuantizationConfig) -> QuantizationConfig:
+        """
+        Update quant config to so that ignored module and target module names
+        match the vLLM model names.
+        Right now this is specific for compressed-tensors format and
+        load_format mistral.
+        """
+        remapping_rules = [
+            (r"output", r"language_model.lm_head"),
+            (r"layers\.(\d+)\.attention\.wo",
+             r"language_model.model.layers.\1.self_attn.out_proj"),
+            (r"layers\.(\d+)\.attention\.w(.*)",
+             r"language_model.model.layers.\1.self_attn.\2_proj"),
+            (r"layers\.(\d+)\.feed_forward\.w1",
+             r"language_model.model.layers.\1.mlp.gate_proj"),
+            (r"layers\.(\d+)\.feed_forward\.w2",
+             r"language_model.model.layers.\1.mlp.down_proj"),
+            (r"layers\.(\d+)\.feed_forward\.w3",
+             r"language_model.model.layers.\1.mlp.up_proj"),
+            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo",
+             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj"
+             ),
+            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)",
+             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj"
+             ),
+            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)",
+             r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"),
+            (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0",
+             r"whisper_encoder.whisper_encoder.conv1"),
+            (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1",
+             r"whisper_encoder.whisper_encoder.conv2"),
+            (r"mm_whisper_embeddings\.audio_language_projection\.0",
+             r"audio_language_adapter.w_in"),
+            (r"mm_whisper_embeddings\.audio_language_projection\.2",
+             r"audio_language_adapter.w_out"),
+        ]
+
+        # Update ignore list
+        if hasattr(quant_config, "ignore"):
+            mistral_ignore = []
+            for name in quant_config.ignore:
+                mistral_name = name
+                for pattern, repl in remapping_rules:
+                    if re.fullmatch(pattern, name):
+                        mistral_name = re.sub(pattern, repl, name)
+                mistral_ignore.append(mistral_name)
+            quant_config.ignore = mistral_ignore
+
+        # Update target list
+        if hasattr(quant_config, "config_groups"):
+            config_groups = quant_config.config_groups
+            for group_name in config_groups:
+                if "targets" in config_groups[group_name]:
+                    targets = []
+                    for name in config_groups[group_name]["targets"]:
+                        mistral_name = name
+                        for pattern, repl in remapping_rules:
+                            if re.fullmatch(pattern, name):
+                                mistral_name = re.sub(pattern, repl, name)
+                        targets.append(mistral_name)
+                config_groups[group_name]["targets"] = targets
+            quant_config.config_groups = config_groups
+
+        return quant_config
+
 
 class AudioLanguageAdapter(nn.Module):
 
@@ -582,7 +673,6 @@ class VoxtralEncoderModel(nn.Module):
         self.whisper_encoder = WhisperEncoder(vllm_config=vllm_config,
                                               prefix=maybe_prefix(
                                                   prefix, "whisper_encoder"),
-                                              is_standalone_encoder=True,
                                               init_in_fp32=True)
         mel_filters = mel_filter_bank(
             num_frequency_bins=1 + self.config.window_size // 2,
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 16bbe2f2010a14698edaaeb41acbf4a79a18f727..41ae7b129782d9ddae0756ec0ef6706ea070a099 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -4,7 +4,7 @@
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from contextlib import nullcontext
-from typing import Optional, TypedDict, Union, cast
+from typing import Annotated, Literal, Optional, Union, cast
 
 import numpy as np
 import torch
@@ -15,6 +15,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
 
 from vllm.attention import Attention, AttentionType
 from vllm.attention.layer import MultiHeadAttention
+from vllm.attention.layers.cross_attention import CrossAttention
 from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
                          VllmConfig)
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -40,9 +41,10 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
                                         PromptReplacement, PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
-                         SupportsTranscription, SupportsV0Only)
+                         SupportsTranscription)
 from .utils import (AutoWeightsLoader, WeightsMapper, cast_overflow_tensors,
                     make_layers)
 
@@ -111,9 +113,44 @@ ISO639_1_SUPPORTED_LANGS = {
 }
 
 
-class WhisperAudioInputs(TypedDict):
-    input_features: NestedTensors
-    """Shape: `(batch_size, 128, M)`"""
+class WhisperAudioInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - nmb: Number of mel bins
+        - t: Time frames (M)
+    """
+
+    input_features: Annotated[Optional[NestedTensors],
+                              TensorShape("b", "nmb", "t")]
+
+
+class WhisperEncoderAttention(MultiHeadAttention):
+    """Multi-headed attention for Whisper encoder with 2D tensor support."""
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Input shape: batch_size x seq_len x hidden_size
+                     or seq_len x hidden_size
+        """
+        is_2d = query.dim() == 2
+        if is_2d:
+            query = query.unsqueeze(0)
+            key = key.unsqueeze(0)
+            value = value.unsqueeze(0)
+
+        # Call the parent forward method
+        out = super().forward(query, key, value)
+
+        if is_2d:
+            out = out.squeeze(0)
+
+        return out
 
 
 class WhisperPositionalEmbedding(nn.Embedding):
@@ -136,7 +173,6 @@ class WhisperAttention(nn.Module):
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
-        standalone_encoder: bool = False,
     ):
         super().__init__()
         self.embed_dim = embed_dim
@@ -172,14 +208,25 @@ class WhisperAttention(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.out_proj",
         )
-        if standalone_encoder:
-            self.attn = MultiHeadAttention(
+        if attn_type == AttentionType.ENCODER:
+            self.attn = WhisperEncoderAttention(
                 self.num_heads,
                 self.head_dim,
                 self.scaling,
                 num_kv_heads=self.num_kv_heads,
             )
-        else:
+        elif self.attn_type == AttentionType.ENCODER_DECODER:
+            self.attn = CrossAttention(
+                self.num_heads,
+                self.head_dim,
+                self.scaling,
+                num_kv_heads=self.num_kv_heads,
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.attn",
+                attn_type=self.attn_type,
+            )
+        else:  # AttentionType.DECODER (regular decoder self-attention)
             self.attn = Attention(
                 self.num_heads,
                 self.head_dim,
@@ -324,11 +371,7 @@ class WhisperMLP(nn.Module):
 
 class WhisperEncoderLayer(nn.Module):
 
-    def __init__(self,
-                 *,
-                 vllm_config: VllmConfig,
-                 prefix: str = "",
-                 is_standalone_encoder: bool = False):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
@@ -342,7 +385,6 @@ class WhisperEncoderLayer(nn.Module):
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
-            standalone_encoder=is_standalone_encoder,
         )
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
         self.mlp = WhisperMLP(
@@ -438,12 +480,10 @@ class WhisperEncoder(nn.Module):
                  *,
                  vllm_config: VllmConfig,
                  prefix: str = "",
-                 is_standalone_encoder: bool = False,
                  init_in_fp32: bool = False):
         super().__init__()
         config = vllm_config.model_config.hf_config
         embed_dim = config.d_model
-        self.is_standalone_encoder = is_standalone_encoder
         self.num_mel_bins = config.num_mel_bins
         self.max_source_positions = config.max_source_positions
         self.embed_scale = (math.sqrt(embed_dim)
@@ -461,9 +501,7 @@ class WhisperEncoder(nn.Module):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.encoder_layers,
             lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
-                                               prefix=f"{prefix}.layers",
-                                               is_standalone_encoder=
-                                               is_standalone_encoder),
+                                               prefix=f"{prefix}.layers"),
             prefix=f"{prefix}.layers",
         )
         self.layer_norm = nn.LayerNorm(config.d_model)
@@ -744,7 +782,7 @@ class WhisperMultiModalProcessor(
                                         info=WhisperProcessingInfo,
                                         dummy_inputs=WhisperDummyInputsBuilder)
 class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
-                                      SupportsMultiModal, SupportsV0Only):
+                                      SupportsMultiModal):
     packed_modules_mapping = {
         "self_attn.qkv_proj": [
             "self_attn.q_proj",
@@ -783,8 +821,9 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
             model_config: ModelConfig,  # not needed here
             stt_config: SpeechToTextConfig,
             language: Optional[str],
-            task_type: str,
-            request_prompt: str) -> PromptType:
+            task_type: Literal["transcribe", "translate"],
+            request_prompt: str,
+            to_language: Optional[str]) -> PromptType:
         if language is None:
             raise ValueError(
                 "Language must be specified when creating the Whisper prompt")
@@ -871,19 +910,17 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
 
     def get_multimodal_embeddings(self,
                                   **kwargs: object) -> MultiModalEmbeddings:
-        # TODO: This method does not obey the interface for SupportsMultiModal.
-        # Refactor this once encoder/decoder support is implemented in V1.
+        # Required as part of SupportsMultiModal interface.
         audio_input = self._parse_and_validate_audio_input(**kwargs)
-        return self.model.get_encoder_outputs(audio_input["input_features"])
+        return [self.model.get_encoder_outputs(audio_input["input_features"])]
 
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[NestedTensors] = None,
     ) -> torch.Tensor:
-        # TODO: This method just returns the decoder sequence embeddings since
-        # Whisper does not have encoder text tokens. Refactor this once
-        # encoder/decoder support is implemented in V1.
+        # This method just returns the decoder sequence embeddings since
+        # Whisper does not have encoder text tokens.
         return self.model.decoder.get_input_embeddings(input_ids)
 
     def _parse_and_validate_audio_input(
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index ed65944c109bd463b08888a8c83193ac94ababb5..34b9c1ad07d76f15332e9f913ba0c15d2a6d7b7e 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -9,7 +9,7 @@ model alternates between state space model layers and attention-based layers.
 """
 from collections.abc import Iterable
 from itertools import cycle
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 import torch
 from torch import nn
@@ -528,8 +528,6 @@ class Zamba2MambaDecoderLayer(nn.Module):
             hidden_states: Input tensor [batch_size, seq_len, hidden_size]
             mamba_cache_params: Parameters for Mamba's state caches 
                 (one for conv, one for ssm)
-            sequence_idx: Index tensor for identifying sequences in batch
-                Required for proper chunked processing in prefill
             transformer_hidden_states: Optional output from transformer path
                 Added to input if provided (used in hybrid architecture)
             positions: Optional position IDs (unused in Mamba)
@@ -591,8 +589,6 @@ class Zamba2HybridLayer(nn.Module):
         
         Args:
             shared_transformer: Transformer decoder layer for attention pathway
-            linear: Linear projection for transformer output before Mamba
-            mamba: Mamba decoder layer for state space pathway
         """
         super().__init__()
         self.block_idx = block_idx
@@ -630,8 +626,6 @@ class Zamba2HybridLayer(nn.Module):
             positions: Position IDs for positional embeddings
             mamba_cache_params: Parameters for Mamba's state caches 
                 (one for conv, one for ssm)
-            sequence_idx: Indices for identifying sequences in batch,
-                required for proper chunked processing in prefill
             
         Returns:
             Output tensor combining transformer and Mamba representations
@@ -915,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
             prefix: Optional prefix for parameter names
         
         Raises:
-            AssertionError: If prefix caching is enabled (not supported by 
-            Mamba)
+            AssertionError: If prefix caching is enabled
+            (not supported by Mamba)
         """
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
@@ -971,7 +965,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
                 input_ids: torch.Tensor,
                 positions: torch.Tensor,
                 inputs_embeds: Optional[torch.Tensor] = None,
-                **kwargs) -> torch.Tensor:
+                **kwargs: Any) -> torch.Tensor:
         """Forward pass through the model.
         
         Args:
@@ -1012,9 +1006,9 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
 
         return hidden_states
 
-    def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str,
-                                                                 torch.Tensor],
-                                       **kwargs) -> dict[str, torch.Tensor]:
+    def copy_inputs_before_cuda_graphs(
+            self, input_buffers: dict[str, torch.Tensor],
+            **kwargs: Any) -> dict[str, torch.Tensor]:
         """Copy inputs before CUDA graph capture.
         
         Args:
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 9465308e94e659a589c1d3dbe70cecd289e4d197..221712ba9a3387f4d30854d94433a5ee64882b61 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -57,6 +57,8 @@ class BasevLLMParameter(Parameter):
             weight_loader = _make_synced_weight_loader(weight_loader)
 
         self._weight_loader = weight_loader
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
 
     @property
     def weight_loader(self):
@@ -116,10 +118,10 @@ class _ColumnvLLMParameter(BasevLLMParameter):
         return self._output_dim
 
     def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
-        tp_rank = get_tensor_model_parallel_rank()
         shard_size = self.data.shape[self.output_dim]
         loaded_weight = loaded_weight.narrow(self.output_dim,
-                                             tp_rank * shard_size, shard_size)
+                                             self.tp_rank * shard_size,
+                                             shard_size)
         assert self.data.shape == loaded_weight.shape
         self.data.copy_(loaded_weight)
 
@@ -127,6 +129,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
 
         shard_offset = kwargs.get("shard_offset")
         shard_size = kwargs.get("shard_size")
+
         # TODO: move these to PackedColumnParameter and PackedvLLMParameter
         if isinstance(
                 self,
@@ -137,11 +140,11 @@ class _ColumnvLLMParameter(BasevLLMParameter):
 
         param_data = self.data
 
-        tp_rank = get_tensor_model_parallel_rank()
         param_data = param_data.narrow(self.output_dim, shard_offset,
                                        shard_size)
         loaded_weight = loaded_weight.narrow(self.output_dim,
-                                             tp_rank * shard_size, shard_size)
+                                             self.tp_rank * shard_size,
+                                             shard_size)
         assert param_data.shape == loaded_weight.shape
         param_data.copy_(loaded_weight)
 
@@ -161,8 +164,8 @@ class _ColumnvLLMParameter(BasevLLMParameter):
                 shard_offset=shard_offset, shard_size=shard_size)
 
         param_data = self.data
-        tp_rank = get_tensor_model_parallel_rank()
-        shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
+        shard_id = (self.tp_rank if shard_id == "q" else self.tp_rank //
+                    num_heads)
         param_data = param_data.narrow(self.output_dim, shard_offset,
                                        shard_size)
         loaded_weight = loaded_weight.narrow(self.output_dim,
@@ -189,10 +192,10 @@ class RowvLLMParameter(BasevLLMParameter):
         return self._input_dim
 
     def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
-        tp_rank = get_tensor_model_parallel_rank()
         shard_size = self.data.shape[self.input_dim]
         loaded_weight = loaded_weight.narrow(self.input_dim,
-                                             tp_rank * shard_size, shard_size)
+                                             self.tp_rank * shard_size,
+                                             shard_size)
 
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
@@ -414,9 +417,6 @@ class SharedWeightParameter(BasevLLMParameter):
             "weight_loader": self._fake_weight_loader
         }
 
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-
         if self.tp_size > 1:
             raise NotImplementedError(f"{self.__class__.__name__} does not "
                                       "currently support tensor parallelism")
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 56f0f0984bfa0ec9a3054c9a9d7b351355cfe2e7..2315f9dad5a5af2cb5419f90d5e0ec94c45aa79d 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -97,7 +97,7 @@ class SamplingMetadataCache:
 class SamplingMetadata:
     """Metadata for input sequences. Used in sampler.
 
-    The usage is as follow;
+    The usage is as follows;
     ```
     hidden_states = execute_model(...)
     logits = hidden_states[sampling_metadata.selected_token_indices]
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 41ed0b09c5a2ab85c86ee02c29b90a69a4e3edf8..65436786f82ac3e95c6d28050a94916599c22078 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -52,10 +52,11 @@ def set_weight_attrs(
 def _make_synced_weight_loader(original_weight_loader):
 
     def _synced_weight_loader(param, *args, **kwargs):
-        original_weight_loader(param, *args, **kwargs)
+        out = original_weight_loader(param, *args, **kwargs)
         # torch._sync doesn't support, is not needed for CPU tensors.
         if param.device != torch.device("cpu"):
             torch._sync(param)
+        return out
 
     return _synced_weight_loader
 
diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index 74599fa44c88c0b06569641db9a6508e3efaa115..a25ef86a989db8cb52abe39dc6938a8039ceb301 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -10,6 +10,7 @@ import torch
 from tqdm import tqdm
 
 import vllm.envs as envs
+from vllm.distributed.parallel_state import get_dp_group
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
 from vllm.model_executor.layers.fused_moe.deep_gemm_utils import (
     compute_aligned_M, deep_gemm_block_shape)
@@ -131,11 +132,9 @@ def _deepgemm_fp8_gemm_nt_warmup(w: torch.Tensor, ws: torch.Tensor,
 GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE: set[torch.Size] = set()
 
 
-def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor,
-                                                    w2: torch.Tensor,
-                                                    w1_scale: torch.Tensor,
-                                                    w2_scale: torch.Tensor,
-                                                    num_topk: int):
+def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
+        w1: torch.Tensor, w2: torch.Tensor, w1_scale: torch.Tensor,
+        w2_scale: torch.Tensor, num_topk: int, max_tokens: int):
     if (w1.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE
             and w2.size() in GROUPED_FP8_GEMM_NT_CONTIGUOUS_WARMUP_CACHE):
         return
@@ -147,9 +146,13 @@ def _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(w1: torch.Tensor,
     num_experts = w1.size(0)
     device = w1.device
 
+    # Assumes all ranks have the same max_num_batched_tokens
+    max_tokens_across_dp = get_dp_group().world_size * max_tokens
+    max_tokens = min(max_tokens_across_dp, envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+
     # This is the maximum GroupedGemm M size that we expect to run
     # the grouped_gemm with.
-    MAX_M = compute_aligned_M(envs.VLLM_FUSED_MOE_CHUNK_SIZE,
+    MAX_M = compute_aligned_M(max_tokens,
                               num_topk,
                               num_experts,
                               block_m,
@@ -201,7 +204,8 @@ def deepgemm_fp8_gemm_nt_warmup(model: torch.nn.Module, max_tokens: int):
         _deepgemm_fp8_gemm_nt_warmup(w=w, ws=ws, max_tokens=max_tokens)
 
 
-def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module):
+def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module,
+                                                   max_tokens: int):
     dg_modules = [
         m for m in model.modules()
         if _fused_moe_grouped_gemm_may_use_deep_gemm(m)
@@ -211,9 +215,9 @@ def deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model: torch.nn.Module):
         w13, w13_scale, w2, w2_scale, num_topk = (
             _extract_data_from_fused_moe_module(dgm))
         _deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(
-            w13, w2, w13_scale, w2_scale, num_topk)
+            w13, w2, w13_scale, w2_scale, num_topk, max_tokens)
 
 
 def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int):
     deepgemm_fp8_gemm_nt_warmup(model, max_tokens)
-    deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model)
+    deepgemm_grouped_fp8_gemm_nt_contiguous_warmup(model, max_tokens)
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 761172e4d361664733a66eb059335435886becd0..89ce20308f447f6d0f8899b11473763869aac6cd 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING
 import torch
 
 import vllm.envs as envs
+from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import is_deep_gemm_supported
@@ -19,6 +20,8 @@ if TYPE_CHECKING:
     from vllm.v1.worker.gpu_model_runner import GPUModelRunner
     from vllm.v1.worker.gpu_worker import Worker
 
+logger = init_logger(__name__)
+
 
 def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
@@ -30,10 +33,33 @@ def kernel_warmup(worker: "Worker"):
         max_tokens = worker.scheduler_config.max_num_batched_tokens
         deep_gemm_warmup(model, max_tokens)
 
-    # FlashInfer autotune for Blackwell (SM 10.0) GPUs
-    if has_flashinfer() and current_platform.is_device_capability(100):
+    # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
+    if has_flashinfer() and current_platform.has_device_capability(90):
         flashinfer_autotune(worker.model_runner)
 
+    # FlashInfer attention warmup
+    # Only warmup if the model has FlashInfer attention groups
+    # and is not a pooling model
+    def _is_flashinfer_backend(backend):
+        try:
+            return backend.get_name() == "FLASHINFER_VLLM_V1"
+        except NotImplementedError:
+            return False
+
+    if not worker.model_runner.is_pooling_model and all(
+            _is_flashinfer_backend(group.backend)
+            for groups in worker.model_runner.attn_groups for group in groups):
+        logger.info("Warming up FlashInfer attention.")
+        # Warmup with mixed batch containing both prefill and decode tokens
+        # This is to warm up both prefill and decode attention kernels
+        worker.model_runner._dummy_run(
+            num_tokens=16,
+            skip_eplb=True,
+            is_profile=True,
+            force_attention=True,
+            create_mixed_batch=True,
+        )
+
 
 def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 0531b7bd9f0a721994628f46b2894ede3eb24c67..e5db356b635f33f1677635351f0e047546c05443 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1022,13 +1022,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         return self.apply(prompt,
                           mm_data,
                           hf_processor_mm_kwargs,
-                          mm_hash_overrides=mm_hash_overrides)
+                          mm_uuids=mm_uuids)
 
     def _get_data_parser(self) -> MultiModalDataParser:
         """
@@ -1364,8 +1363,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalHashes:
         """Create MM hashes to be returned (only used in V1).
 
@@ -1376,30 +1374,30 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         model_id = self.info.model_id
 
         hashes: MultiModalHashes = {}
-        mm_hash_overrides = mm_hash_overrides or {}
+        mm_uuids = mm_uuids or {}
 
         for modality, items in mm_items.items():
-            if modality in mm_hash_overrides:
-                mm_hashes = mm_hash_overrides[modality]
-                if isinstance(mm_hashes, str):
-                    mm_hashes = [mm_hashes]
+            if modality in mm_uuids:
+                mm_uuids_per_modality = mm_uuids[modality]
+                if isinstance(mm_uuids_per_modality, str):
+                    mm_uuids_per_modality = [mm_uuids_per_modality]
 
                 # For None entries, compute a hash; otherwise, use provided ID.
                 computed: list[str] = []
                 for i, item in enumerate(items):
-                    mm_hash = mm_hashes[i]
+                    item_uuid = mm_uuids_per_modality[i]
 
-                    # NOTE: Even if a mm_hash is provided, we still compute a
+                    # NOTE: Even if a item_uuid is provided, we still compute a
                     # hash if `hf_processor_mm_kwargs` or `tokenization_kwargs`
                     # are provided. This is because the processed multimodal
                     # inputs can be different depending on the processor kwargs.
-                    if mm_hash is None or \
+                    if item_uuid is None or \
                         hf_processor_mm_kwargs or \
                         tokenization_kwargs:
 
                         # NOTE: use provided hash string to hash with kwargs
                         # if available for better performance.
-                        item = mm_hash if mm_hash is not None else item
+                        item = item_uuid if item_uuid is not None else item
                         computed.append(
                             MultiModalHasher.hash_kwargs(
                                 model_id=model_id,
@@ -1407,7 +1405,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                                 **hf_processor_mm_kwargs,
                                 **tokenization_kwargs))
                     else:
-                        computed.append(mm_hash)
+                        computed.append(item_uuid)
                 hashes[modality] = computed
             else:
                 hashes[modality] = [
@@ -1514,8 +1512,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         (
             prompt_ids,
@@ -1539,7 +1536,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_hashes = self._hash_mm_items(mm_data_items,
                                         hf_processor_mm_kwargs,
                                         tokenization_kwargs,
-                                        mm_hash_overrides=mm_hash_overrides)
+                                        mm_uuids=mm_uuids)
 
         mm_prompt_updates = self._get_mm_prompt_updates(
             mm_data_items,
@@ -1562,8 +1559,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         """
         Apply the HF processor on the full prompt text,
@@ -1578,13 +1574,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 mm_data_items=mm_data_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
                 tokenization_kwargs=tokenization_kwargs,
-                mm_hash_overrides=mm_hash_overrides,
+                mm_uuids=mm_uuids,
             )
 
         mm_hashes = self._hash_mm_items(mm_data_items,
                                         hf_processor_mm_kwargs,
                                         tokenization_kwargs,
-                                        mm_hash_overrides=mm_hash_overrides)
+                                        mm_uuids=mm_uuids)
 
         mm_missing_data_items = self._get_cache_missing_items(
             cache=cache,
@@ -1785,8 +1781,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1815,7 +1810,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             mm_items,
             hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         # NOTE: tokenization_kwargs are not required to init processor
@@ -1901,8 +1896,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Optional[Mapping[str, object]] = None,
         *,
-        mm_hash_overrides: Optional[Union[dict[str, list[str]],
-                                          MultiModalUUIDDict]] = None,
+        mm_uuids: Optional[MultiModalUUIDDict] = None,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1917,7 +1911,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
             mm_data,
             hf_processor_mm_kwargs,
             tokenization_kwargs,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
 
         return self._get_enc_dec_inputs(
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 834b2189e4bed8f44d679996664ce0f4fa28ec4b..e09c97de576efde992d34d6faa44e46e01d6c1f7 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -9,7 +9,7 @@ from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname
 
@@ -213,7 +213,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> Image.Image:
         """
-        Load a PIL image from a HTTP or base64 data URL.
+        Load a PIL image from an HTTP or base64 data URL.
 
         By default, the image is converted into RGB format.
         """
@@ -237,7 +237,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> Image.Image:
         """
-        Asynchronously load a PIL image from a HTTP or base64 data URL.
+        Asynchronously load a PIL image from an HTTP or base64 data URL.
 
         By default, the image is converted into RGB format.
         """
@@ -261,7 +261,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
-        Load video from a HTTP or base64 data URL.
+        Load video from an HTTP or base64 data URL.
         """
         image_io = ImageMediaIO(image_mode=image_mode,
                                 **self.media_io_kwargs.get("image", {}))
@@ -281,7 +281,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
-        Asynchronously load video from a HTTP or base64 data URL.
+        Asynchronously load video from an HTTP or base64 data URL.
 
         By default, the image is converted into RGB format.
         """
@@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(
 
     def modality_group_func(
             mm_input: MultiModalKwargsItems) -> Union[str, int]:
-        # If the input has multiple modalities, return a id as the unique key
+        # If the input has multiple modalities, return an id as the unique key
         # for the mm_input input.
         if len(mm_input) > 1:
             return id(mm_input)
@@ -378,10 +378,7 @@ def group_mm_inputs_by_modality(
         elif len(mm_input) == 1:
             return next(iter(mm_input.keys()))
 
-        # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty,
-        # this is used to make InternVL with legacy pipeline still work with v1.
-        else:
-            return ""
+        raise AssertionError("This line should be unreachable.")
 
     return [
         list(group) for _, group in groupby(mm_inputs, key=modality_group_func)
@@ -444,7 +441,6 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor,
     Args:
         image_input (torch.Tensor): Image input tensor.
         vision_model (torch.nn.Module): Vision model.
-
     Returns:
         torch.Tensor: Output image embeddings
     """
@@ -542,6 +538,8 @@ def run_dp_sharded_mrope_vision_model(
     vision_model: torch.nn.Module,
     pixel_values: torch.Tensor,
     grid_thw_list: list[list[int]],
+    *,
+    rope_type: Literal["rope_3d", "rope_2d"],
 ) -> tuple[torch.Tensor, ...]:
     """Run a vision model with data parallelism (DP) sharding. 
     The function will shard the input image tensor on the 
@@ -552,6 +550,10 @@ def run_dp_sharded_mrope_vision_model(
         vision_model (torch.nn.Module): Vision model.
         pixel_values (torch.Tensor): Image/Video input tensor.
         grid_thw_list: List of grid dimensions for each image
+        rope_type: Type of rope used in the vision model.
+                   Different rope types have different dimension to do ViT.
+                   "rope_3d" for 3D rope (e.g., Qwen2.5-VL)
+                   "rope_2d" for 2D rope (e.g., Kimi-VL)
     Returns:
         torch.Tensor: Output image embeddings
 
@@ -605,8 +607,12 @@ def run_dp_sharded_mrope_vision_model(
                                          device=pixel_values.device,
                                          dtype=pixel_values.dtype)
     # embed_dim_reduction_factor = 2 * 2
-    embed_dim_reduction_factor = (vision_model.spatial_merge_size *
-                                  vision_model.spatial_merge_size)
+    if rope_type == "rope_2d":
+        embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] *
+                                      vision_model.merge_kernel_size[1])
+    else:
+        embed_dim_reduction_factor = (vision_model.spatial_merge_size *
+                                      vision_model.spatial_merge_size)
 
     # Find the max length across all ranks
     # The output embedding of every DP rank has to be
@@ -617,23 +623,42 @@ def run_dp_sharded_mrope_vision_model(
     local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
 
     # Run the vision model on the local pixel_values_local
-    if pixel_values_local.shape[0] > 0:
-        image_embeds_local = vision_model(pixel_values_local,
-                                          local_grid_thw_list)
+    if rope_type == "rope_2d":
+        if pixel_values_local.shape[0] > 0:
+            image_embeds_local = vision_model(
+                pixel_values_local, torch.tensor(local_grid_thw_list))
+            if isinstance(image_embeds_local, list):
+                image_embeds_local = torch.cat(image_embeds_local, dim=0)
+        else:
+            out_dim = getattr(vision_model.config, "hidden_size", None)
+            image_embeds_local = torch.empty(
+                (0, embed_dim_reduction_factor, out_dim),
+                device=pixel_values.device,
+                dtype=pixel_values.dtype)
     else:
-        # Handle empty case
-        image_embeds_local = torch.empty((0, vision_model.out_hidden_size),
-                                         device=pixel_values.device,
-                                         dtype=pixel_values.dtype)
+        if pixel_values_local.shape[0] > 0:
+            image_embeds_local = vision_model(pixel_values_local,
+                                              local_grid_thw_list)
+        else:
+            # Handle empty case
+            image_embeds_local = torch.empty((0, vision_model.out_hidden_size),
+                                             device=pixel_values.device,
+                                             dtype=pixel_values.dtype)
 
     # Pad the output based on max_len_per_rank
     # for tensor_model_parallel_all_gather to work
     current_len = image_embeds_local.shape[0]
     if current_len < max_len_per_rank:
         padding_size = max_len_per_rank - current_len
-        padding = torch.empty((padding_size, image_embeds_local.shape[1]),
-                              dtype=image_embeds_local.dtype,
-                              device=image_embeds_local.device)
+        if rope_type == "rope_2d":
+            padding = torch.empty((padding_size, image_embeds_local.shape[1],
+                                   image_embeds_local.shape[2]),
+                                  dtype=image_embeds_local.dtype,
+                                  device=image_embeds_local.device)
+        else:
+            padding = torch.empty((padding_size, image_embeds_local.shape[1]),
+                                  dtype=image_embeds_local.dtype,
+                                  device=image_embeds_local.device)
         image_embeds_local_padded = torch.cat([image_embeds_local, padding],
                                               dim=0)
     else:
@@ -674,7 +699,6 @@ def run_dp_sharded_mrope_vision_model(
                     embed_start:embed_start + img_patches]
                 embed_start += img_patches
             current_idx += count
-
     out_embeddings = tuple(embed for embed in original_order_embeddings
                            if embed is not None)
     assert len(out_embeddings) == len(
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ef1380bdb614c6abacde27e3c7978218abdc66ce..df6e19da82ca2b8bec932ef6f0466cd1b2f7eb0a 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import base64
+import math
 from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any
+from typing import Any, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -104,10 +104,12 @@ class OpenCVVideoBackend(VideoLoader):
         return api_pref
 
     @classmethod
-    def load_bytes(cls,
-                   data: bytes,
-                   num_frames: int = -1,
-                   **kwargs) -> tuple[npt.NDArray, dict[str, Any]]:
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
         import cv2
 
         backend = cls().get_cv2_video_api()
@@ -119,6 +121,15 @@ class OpenCVVideoBackend(VideoLoader):
         original_fps = cap.get(cv2.CAP_PROP_FPS)
         duration = total_frames_num / original_fps if original_fps > 0 else 0
 
+        # Use transformers transformers.video_utils.VideoMetadata format
+        metadata = {
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
+            "duration": duration,
+            "video_backend": "opencv"
+        }
+
+        # resample video to target num_frames
         full_read = num_frames == -1 or total_frames_num < num_frames
         if full_read:
             num_frames = total_frames_num
@@ -148,14 +159,88 @@ class OpenCVVideoBackend(VideoLoader):
         assert i == num_frames, (f"Expected reading {num_frames} frames, "
                                  f"but only loaded {i} frames from video.")
 
+        return frames, metadata
+
+
+@VIDEO_LOADER_REGISTRY.register("opencv_dynamic")
+class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
+
+    @classmethod
+    def load_bytes(
+        cls,
+        data: bytes,
+        num_frames: int = -1,
+        requested_fps: int = 2,
+        max_duration: int = 300,
+        **kwargs,
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        import cv2
+
+        backend = cls().get_cv2_video_api()
+        cap = cv2.VideoCapture(BytesIO(data), backend, [])
+        if not cap.isOpened():
+            raise ValueError("Could not open video stream")
+
+        total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        original_fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames_num / original_fps if original_fps > 0 else 0
+
         # Use transformers transformers.video_utils.VideoMetadata format
         metadata = {
             "total_num_frames": total_frames_num,
             "fps": original_fps,
             "duration": duration,
-            "video_backend": "opencv"
+            "video_backend": "opencv_dynamic"
         }
 
+        # resample video to target num_frames
+        max_frame_idx = total_frames_num - 1
+        duration = duration or round(max_frame_idx / original_fps) + 1
+
+        # Refer to:
+        # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
+        frame_indices: Union[range, list[int]]
+        if duration <= max_duration:
+            n = int(math.floor(duration * requested_fps))
+            frame_indices = sorted({
+                min(max_frame_idx,
+                    int(math.ceil(i * original_fps / requested_fps)))
+                for i in range(n)
+            })
+        else:
+            num_samples = int(max_duration * requested_fps)
+            if num_samples >= total_frames_num:
+                frame_indices = range(total_frames_num)
+            else:
+                target_seconds = np.linspace(0,
+                                             duration,
+                                             num_samples,
+                                             endpoint=True)
+                frame_indices = sorted({
+                    min(max_frame_idx, int(math.ceil(t * original_fps)))
+                    for t in target_seconds
+                })
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames = np.empty((len(frame_indices), height, width, 3),
+                          dtype=np.uint8)
+
+        i = 0
+        for idx in range(total_frames_num):
+            ok = cap.grab()
+            if not ok:
+                break
+            if idx in frame_indices:
+                ret, frame = cap.retrieve()
+                if ret:
+                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    i += 1
+
+        assert i == len(frame_indices), (
+            f"Expected reading {len(frame_indices)} frames, "
+            f"but only loaded {i} frames from video.")
+
         return frames, metadata
 
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index acdb2f89ce7354bc369a5248da6c8e685b809a14..64bcfd472f2ada5fa7b4cb6770f9d73523dcce48 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -11,11 +11,12 @@ import torch
 from typing_extensions import TypeVar
 
 from vllm.logger import init_logger
+from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind
-from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
-                           SequenceGroup, SequenceGroupBase, SequenceStatus)
+from vllm.sequence import (RequestMetrics, SequenceGroup, SequenceGroupBase,
+                           SequenceStatus)
 
 logger = init_logger(__name__)
 
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index 56edb8629e45b35526ee0100a6ffd12fc2255070..9b64817da648c67fc2e55d8a1fa553b3f60d9b4d 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -169,37 +169,12 @@ def cpu_platform_plugin() -> Optional[str]:
     return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
 
 
-def neuron_platform_plugin() -> Optional[str]:
-    tnx_installed = False
-    nxd_installed = False
-    logger.debug("Checking if Neuron platform is available.")
-    try:
-        import transformers_neuronx  # noqa: F401
-        tnx_installed = True
-        logger.debug("Confirmed Neuron platform is available because"
-                     " transformers_neuronx is found.")
-    except ImportError:
-        pass
-
-    try:
-        import neuronx_distributed_inference  # noqa: F401
-        nxd_installed = True
-        logger.debug("Confirmed Neuron platform is available because"
-                     " neuronx_distributed_inference is found.")
-    except ImportError:
-        pass
-
-    is_neuron = tnx_installed or nxd_installed
-    return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
-
-
 builtin_platform_plugins = {
     'tpu': tpu_platform_plugin,
     'cuda': cuda_platform_plugin,
     'rocm': rocm_platform_plugin,
     'xpu': xpu_platform_plugin,
     'cpu': cpu_platform_plugin,
-    'neuron': neuron_platform_plugin,
 }
 
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2c36df7bff3414c1965539df5849b9c3f8d5a859..12406033819241f6611271fd780a7b6f2d438e87 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -75,12 +75,12 @@ class CpuPlatform(Platform):
     def supported_dtypes(self) -> list[torch.dtype]:
         if self.get_cpu_architecture() == CpuArchEnum.POWERPC:
             return [torch.bfloat16, torch.float32]
-        elif sys.platform.startswith(
-                "darwin") and self.get_cpu_architecture() == CpuArchEnum.ARM:
-            # TODO: change this condition to check if the platform support bf16
-            # instead of checking the OS. For instance M2 shall supports bf16
-            # already. But we need to modify `cpu_extension.cmake` to activate
-            # the feature in the build.
+        elif (self.get_cpu_architecture() == CpuArchEnum.ARM
+              and sys.platform.startswith("darwin")):
+            if (subprocess.check_output(
+                ["sysctl -n hw.optional.arm.FEAT_BF16"],
+                    shell=True).strip() == b"1"):
+                return [torch.bfloat16, torch.float16, torch.float32]
             return [torch.float16, torch.float32]
         # x86/aarch64 CPU has supported both bf16 and fp16 natively.
         return [torch.bfloat16, torch.float16, torch.float32]
@@ -361,3 +361,7 @@ class CpuPlatform(Platform):
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 5cbb7346436ef931694680f09dc264e67f3483b1..e40b6eb2b5a4ded133fb3831b2290b134c942aec 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -146,6 +146,7 @@ class CudaPlatformBase(Platform):
             # required block_size.
             use_flashmla = False
             use_cutlass_mla = False
+            use_flashinfer_mla = False
 
             if envs.VLLM_ATTENTION_BACKEND is None:
                 # Default case
@@ -164,6 +165,8 @@ class CudaPlatformBase(Platform):
                 use_flashmla = (envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
                 use_cutlass_mla = (
                     envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA")
+                use_flashinfer_mla = (
+                    envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA")
 
             from vllm.attention.ops.flashmla import is_flashmla_supported
             if use_flashmla and is_flashmla_supported()[0] \
@@ -176,6 +179,11 @@ class CudaPlatformBase(Platform):
                 cache_config.block_size = 128
                 logger.info("Forcing kv cache block size to 128 for "
                             "CUTLASS_MLA backend.")
+            if use_flashinfer_mla and cache_config.block_size not in [32, 64]:
+                cache_config.block_size = 64
+                logger.info(
+                    "Forcing kv cache block size to 64 for FlashInferMLA "
+                    "backend.")
 
         # lazy import to avoid circular import
         from vllm.config import CUDAGraphMode
@@ -183,16 +191,14 @@ class CudaPlatformBase(Platform):
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                 and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+                and compilation_config.cudagraph_mode
+                not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]):
             logger.info(
-                "Data Parallel: disabling cudagraphs since DP "
-                "with DeepEP high-throughput kernels are not CUDA Graph "
-                "compatible. The DeepEP low-latency kernels are CUDA Graph "
-                "compatible. Set the all_to_all backend to deepep_low_latency "
-                "to use those kernels instead.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-            if model_config is not None:
-                model_config.enforce_eager = True
+                "Data Parallel with DeepEP high-throughput: using PIECEWISE "
+                "CUDA graphs and excluding MoE ops from capture. Set "
+                "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
+                "graphs captured as well.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
     @classmethod
     def get_current_memory_usage(cls,
@@ -223,9 +229,33 @@ class CudaPlatformBase(Platform):
         if use_mla:
             # TODO(lucas): refactor to be more concise
             #  we should probably consider factoring out V1 here
-            if selected_backend == _Backend.CUTLASS_MLA or (
-                    cls.is_device_capability(100) and selected_backend is None
-                    and block_size == 128):
+
+            from vllm.attention.ops.flashmla import is_flashmla_supported
+            from vllm.attention.utils.fa_utils import flash_attn_supports_mla
+
+            use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or (
+                selected_backend is None and cls.is_device_capability(100)
+                and block_size == 128)
+            use_flashinfermla = selected_backend == _Backend.FLASHINFER_MLA or (
+                selected_backend is None and cls.is_device_capability(100)
+                and block_size in [32, 64])
+            use_flashmla = selected_backend in [
+                _Backend.FLASHMLA, _Backend.FLASHMLA_VLLM_V1
+            ] or (selected_backend is None and is_flashmla_supported()[0])
+            use_flashattn = selected_backend == _Backend.FLASH_ATTN_MLA or (
+                selected_backend is None and flash_attn_supports_mla())
+            use_triton = selected_backend == _Backend.TRITON_MLA or (
+                selected_backend is None)
+
+            def _get_version(name, import_suffix) -> str:
+                if use_v1:
+                    logger.info_once(f"Using {name} backend on V1 engine.")
+                    return f"vllm.v1.attention.backends.mla.{import_suffix}"
+                else:
+                    logger.info_once(f"Using {name} backend.")
+                    return f"vllm.attention.backends.{import_suffix}"
+
+            if use_cutlassmla:
                 if use_v1:
                     logger.info_once("Using Cutlass MLA backend on V1 engine.")
                     return ("vllm.v1.attention.backends.mla."
@@ -233,36 +263,40 @@ class CudaPlatformBase(Platform):
                 else:
                     logger.warning(
                         "Cutlass MLA backend is only supported on V1 engine")
-            if selected_backend == _Backend.TRITON_MLA or block_size != 64:
+            if use_flashinfermla:
                 if use_v1:
-                    logger.info_once("Using Triton MLA backend on V1 engine.")
+                    from vllm.v1.attention.backends.utils import (
+                        set_kv_cache_layout)
+                    set_kv_cache_layout("HND")
+                    logger.info_once(
+                        "Using FlashInfer MLA backend on V1 engine.")
                     return ("vllm.v1.attention.backends.mla."
-                            "triton_mla.TritonMLABackend")
+                            "flashinfer_mla.FlashInferMLABackend")
                 else:
-                    logger.info("Using Triton MLA backend.")
-                    return "vllm.attention.backends.triton_mla.TritonMLABackend"
-            else:
-                from vllm.attention.backends.flashmla import (
-                    is_flashmla_supported)
-                if not is_flashmla_supported()[0]:
                     logger.warning(
-                        "FlashMLA backend is not supported due to %s",
-                        is_flashmla_supported()[1])
-                elif block_size != 64:
+                        "FlashInfer MLA backend is only supported on V1 engine"
+                    )
+            if use_flashmla:
+                if block_size != 64:
                     logger.warning(
                         "FlashMLA backend is not supported for block size %d"
                         " (currently only supports block size 64).",
                         block_size)
                 else:
-                    if use_v1:
-                        logger.info_once(
-                            "Using FlashMLA backend on V1 engine.")
-                        return ("vllm.v1.attention.backends.mla."
-                                "flashmla.FlashMLABackend")
-                    else:
-                        logger.info("Using FlashMLA backend.")
-                        return ("vllm.attention.backends."
-                                "flashmla.FlashMLABackend")
+                    return _get_version("FlashMLA", "flashmla.FlashMLABackend")
+            if use_flashattn:
+                if use_v1:
+                    logger.info_once(
+                        "Using FlashAttention MLA backend on V1 engine.")
+                    return ("vllm.v1.attention.backends.mla."
+                            "flashattn_mla.FlashAttnMLABackend")
+                else:
+                    logger.warning(
+                        "FlashAttention MLA backend is only supported on V1 "
+                        "engine.")
+            if use_triton:
+                return _get_version("Triton MLA",
+                                    "triton_mla.TritonMLABackend")
         if use_v1:
             FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"  # noqa: E501
             FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
@@ -500,8 +534,8 @@ class CudaPlatformBase(Platform):
                 else:
                     attention_backend = "FLASHMLA"
 
-            # Only FlashMLA supports fp8
-            if attention_backend == "FLASHMLA":
+            # Only FlashMLA and CUTLASS_MLA support fp8
+            if attention_backend in ["FLASHMLA", "CUTLASS_MLA"]:
                 supported = True
             else:
                 supported = (not fp8_attention)
@@ -520,6 +554,10 @@ class CudaPlatformBase(Platform):
                     supported = flash_attn_supports_fp8()
                 else:
                     supported = True
+            elif attention_backend == "FLASHINFER":
+                supported = True
+            elif attention_backend == "TRITON_ATTN_VLLM_V1":
+                supported = cls.supports_fp8()
         return supported
 
     @classmethod
@@ -542,6 +580,10 @@ class CudaPlatformBase(Platform):
                     "You can use float16 instead by explicitly setting the "
                     "`dtype` flag in CLI, for example: --dtype=half.")
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index ad12f7f788cf8ebaee09f54d6ed6aef88a62ff76..59aa4681856987f9535371e1d83ad8f4a258046a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -48,13 +48,16 @@ class _Backend(enum.Enum):
     ROCM_AITER_MLA_VLLM_V1 = enum.auto()
     ROCM_AITER_FA = enum.auto()  # used for ViT attn backend
     TORCH_SDPA = enum.auto()
+    TORCH_SDPA_VLLM_V1 = enum.auto()
     FLASHINFER = enum.auto()
     FLASHINFER_VLLM_V1 = enum.auto()
+    FLASHINFER_MLA = enum.auto()
     TRITON_MLA = enum.auto()  # Supported by V1
     TRITON_MLA_VLLM_V1 = enum.auto()
-    FLASHMLA_VLLM_V1 = enum.auto()
-    FLASHMLA = enum.auto()  # Supported by V1
     CUTLASS_MLA = enum.auto()
+    FLASHMLA = enum.auto()  # Supported by V1
+    FLASHMLA_VLLM_V1 = enum.auto()
+    FLASH_ATTN_MLA = enum.auto()  # Supported by V1
     PALLAS = enum.auto()
     PALLAS_VLLM_V1 = enum.auto()
     IPEX = enum.auto()
@@ -72,7 +75,6 @@ class PlatformEnum(enum.Enum):
     TPU = enum.auto()
     XPU = enum.auto()
     CPU = enum.auto()
-    NEURON = enum.auto()
     OOT = enum.auto()
     UNSPECIFIED = enum.auto()
 
@@ -163,9 +165,6 @@ class Platform:
     def is_cpu(self) -> bool:
         return self._enum == PlatformEnum.CPU
 
-    def is_neuron(self) -> bool:
-        return self._enum == PlatformEnum.NEURON
-
     def is_out_of_tree(self) -> bool:
         return self._enum == PlatformEnum.OOT
 
@@ -587,6 +586,13 @@ class Platform:
         """
         raise NotImplementedError
 
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        """
+        Returns if the hybrid kv cache is supported by the current platform.
+        """
+        return False
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
deleted file mode 100644
index cb8ac8db669fe2bfffea5a7d80c0e62f6f423496..0000000000000000000000000000000000000000
--- a/vllm/platforms/neuron.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import enum
-import os
-from functools import lru_cache
-from typing import TYPE_CHECKING, Optional
-
-from vllm import envs
-from vllm.logger import init_logger
-from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
-
-from .interface import Platform, PlatformEnum
-
-if TYPE_CHECKING:
-    from vllm.config import VllmConfig
-else:
-    VllmConfig = None
-
-logger = init_logger(__name__)
-
-
-class NeuronFramework(enum.Enum):
-    TRANSFORMERS_NEURONX = "transformers-neuronx"
-    NEURONX_DISTRIBUTED_INFERENCE = "neuronx-distributed-inference"
-
-
-class NeuronPlatform(Platform):
-    _enum = PlatformEnum.NEURON
-    device_name: str = "neuron"
-    device_type: str = "neuron"
-    ray_device_key: str = "neuron_cores"
-    supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"]
-    dist_backend: str = "gloo"
-    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"
-
-    @classmethod
-    def get_device_name(cls, device_id: int = 0) -> str:
-        return "neuron"
-
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return False
-
-    @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        parallel_config = vllm_config.parallel_config
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = \
-                "vllm.worker.neuron_worker.NeuronWorker"
-
-        if parallel_config.world_size > 1:
-            parallel_config.distributed_executor_backend = "uni"
-
-        if vllm_config.cache_config and vllm_config.model_config:
-            # neuron needs block_size = max_model_len
-            vllm_config.cache_config.block_size = \
-                vllm_config.model_config.max_model_len  # type: ignore
-
-        if vllm_config.model_config and vllm_config.model_config.use_mla:
-            logger.info(
-                "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
-            vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
-            vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
-    @classmethod
-    def is_pin_memory_available(cls) -> bool:
-        logger.warning("Pin memory is not supported on Neuron.")
-        return False
-
-    @classmethod
-    def get_device_communicator_cls(cls) -> str:
-        if envs.VLLM_USE_V1:
-            return "vllm.distributed.device_communicators.neuron_communicator.NeuronCommunicator"  # noqa
-        else:
-            return Platform.get_device_communicator_cls()
-
-    @classmethod
-    def use_all_gather(cls) -> bool:
-        return True
-
-    @classmethod
-    @lru_cache
-    def is_neuronx_distributed_inference(cls) -> bool:
-        try:
-            import neuronx_distributed_inference
-        except ImportError:
-            neuronx_distributed_inference = None
-        return neuronx_distributed_inference is not None
-
-    @classmethod
-    @lru_cache
-    def is_transformers_neuronx(cls) -> bool:
-        try:
-            import transformers_neuronx
-        except ImportError:
-            transformers_neuronx = None
-        return transformers_neuronx is not None
-
-    def get_neuron_framework_to_use(self):
-        """Return the specified framework if corresponding installations are
-        available.
-
-        If no framework is specified, use neuronx-distributed-inference by
-        default.
-        If that's unavailable, check and switch to transformers-neuronx.
-        """
-        if not self.is_neuron():
-            raise AssertionError(
-                f"Neuron Framework unavailable for platform: {self}")
-
-        tnx_installed = self.is_transformers_neuronx()
-        nxd_installed = self.is_neuronx_distributed_inference()
-
-        specified_framework = os.environ.get("VLLM_NEURON_FRAMEWORK")
-        tnx_framework = NeuronFramework.TRANSFORMERS_NEURONX.value
-        nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE.value
-        if specified_framework == tnx_framework and tnx_installed:
-            return self.TRANSFORMERS_NEURONX
-
-        if ((specified_framework == nxd_framework and nxd_installed)
-                or (specified_framework is None and nxd_installed)):
-            return NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE
-
-        if specified_framework is None and tnx_installed:
-            return NeuronFramework.TRANSFORMERS_NEURONX
-
-        return None
-
-    def use_neuronx_distributed(self):
-        """
-        Return True if the framework determined in get_neuron_framework_to_use()
-        is NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE, False otherwise. This
-        is used to select the Neuron model framework and framework-specific
-        configuration to apply during model compilation.
-        """
-        nxd_framework = NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE
-        return self.get_neuron_framework_to_use() == nxd_framework
-
-    def use_transformers_neuronx(self):
-        """
-        Return True if the framework determined in get_neuron_framework_to_use()
-        is NeuronFramework.TRANSFORMERS_NEURONX, False otherwise. This is used
-        to select the Neuron model framework and framework-specific
-        configuration to apply during model compilation.
-        """
-        return self.get_neuron_framework_to_use(
-        ) == NeuronFramework.TRANSFORMERS_NEURONX
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 8d045f1bab5abf4ff1a9a6f60efb5b0dcfe832cf..0194895d4ca4db075a83b57e698864323cb01ef1 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -171,7 +171,7 @@ class RocmPlatform(Platform):
 
     supported_quantization: list[str] = [
         "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
-        "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4"
+        "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao"
     ]
 
     @classmethod
@@ -323,23 +323,35 @@ class RocmPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        from vllm.config.compilation import CUDAGraphMode
+
         cache_config = vllm_config.cache_config
+        compilation_config = vllm_config.compilation_config
+        parallel_config = vllm_config.parallel_config
+        is_eager_execution = compilation_config == CUDAGraphMode.NONE
+
+        use_v1 = envs.VLLM_USE_V1
+        use_aiter_rms_norm = envs.VLLM_ROCM_USE_AITER and \
+             envs.VLLM_ROCM_USE_AITER_RMSNORM
+
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 16
 
-        parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             if vllm_config.speculative_config:
-                if not envs.VLLM_USE_V1:
+                if not use_v1:
                     raise NotImplementedError(
                         "Speculative decoding is not supported on vLLM V0.")
                 parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
             else:
-                if envs.VLLM_USE_V1:
+                if use_v1:
                     parallel_config.worker_cls = \
                         "vllm.v1.worker.gpu_worker.Worker"
                 else:
                     parallel_config.worker_cls = "vllm.worker.worker.Worker"
+        #  Aiter rms norm perform best when CUDA Graph capture is enabled.
+        if use_v1 and use_aiter_rms_norm and not is_eager_execution:
+            compilation_config.custom_ops.append("+rms_norm")
 
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
@@ -488,3 +500,7 @@ class RocmPlatform(Platform):
                     f"Your {gpu_name} GPU {compute_str}. "
                     "You can use float16 instead by explicitly setting the "
                     "`dtype` flag in CLI, for example: --dtype=half.")
+
+    @classmethod
+    def support_hybrid_kv_cache(cls) -> bool:
+        return True
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index ba7b3ed15c504d3476154a98a38be8d5916ac949..15e2352735d70210eecfb85477cf59349714a11f 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -209,6 +209,32 @@ class TpuPlatform(Platform):
                                     model_config: "ModelConfig") -> bool:
         return True
 
+    @classmethod
+    @torch.compile(backend="openxla")
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        torch.ops.xla.dynamo_set_buffer_donor_(dst_cache, True)
+        dst_cache[dst_block_indices] = src_cache[src_block_indices].to(
+            dst_cache.device)
+
+    @classmethod
+    @torch.compile(backend="openxla")
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """ tpu blocks to cpu blocks"""
+        torch.ops.xla.dynamo_set_buffer_donor_(src_cache, True)
+        dst_cache[dst_block_indices] = src_cache[src_block_indices].cpu()
+
 
 try:
     from tpu_commons.platforms import TpuPlatform as TpuCommonsPlatform
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index ac85a5469f205613359a41907f2a248598ec2741..359857a881fa03c26c4d109df20ee718363c9f18 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -37,15 +37,37 @@ class XPUPlatform(Platform):
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
                              block_size: int, use_v1: bool, use_mla: bool,
                              has_sink: bool) -> str:
-        if selected_backend is not None and selected_backend != _Backend.IPEX:
-            logger.info("Cannot use %s backend on XPU.", selected_backend)
         use_v1 = envs.VLLM_USE_V1
-        if use_v1:
-            logger.info("Using Flash Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
-        else:
-            logger.info("Using IPEX attention backend.")
-            return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
+        if not use_v1:
+            raise ValueError("XPU backend only supports V1.")
+        TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"  # noqa: E501
+        FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"  # noqa: E501
+        if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
+            logger.info_once("Using Triton backend on V1 engine.")
+            return TRITON_ATTN_VLLM_V1
+        elif selected_backend == _Backend.FLASH_ATTN:
+            logger.info_once("Using Flash Attention backend on V1 engine.")
+            return FLASH_ATTN_V1
+        elif selected_backend:
+            raise ValueError(
+                f"Invalid attention backend for {cls.device_name}, "
+                f"with use_v1: {use_v1} use_mla: {use_mla}")
+
+        logger.info("Using Flash Attention backend on V1 engine.")
+        return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+
+    @classmethod
+    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
+                                    model_config: "ModelConfig") -> bool:
+        """
+        Check if the kv_cache_dtype is supported.
+        XPU only support fp8 kv cache with triton backend.
+        """
+        if envs.is_set("VLLM_ATTENTION_BACKEND") and \
+            envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN_VLLM_V1":
+            return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"]
+
+        return False
 
     @classmethod
     def set_device(cls, device: torch.device) -> None:
@@ -96,7 +118,7 @@ class XPUPlatform(Platform):
                 cache_config.block_size = 16
 
         # lazy import to avoid circular import
-        from vllm.config import CUDAGraphMode
+        from vllm.config import CompilationLevel, CUDAGraphMode
         compilation_config = vllm_config.compilation_config
         if compilation_config.cudagraph_mode is None or \
                 compilation_config.cudagraph_mode.max_cudagraph_mode() \
@@ -105,6 +127,9 @@ class XPUPlatform(Platform):
                         "cudagraphs. Fallback to cudagraph_mode=NONE")
             compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
+        if vllm_config.lora_config is not None:
+            compilation_config.level = CompilationLevel.NO_COMPILATION
+
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
         if envs.VLLM_USE_V1:
@@ -146,6 +171,13 @@ class XPUPlatform(Platform):
                 vllm_config.scheduler_config.max_model_len,
                 DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
+        if (envs.VLLM_KV_CACHE_LAYOUT is None
+                or envs.VLLM_KV_CACHE_LAYOUT != "NHD"):
+            os.environ["VLLM_KV_CACHE_LAYOUT"] = "NHD"
+            logger.info(
+                "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+                "only NHD layout is supported by XPU attention kernels.")
+
     @classmethod
     def is_pin_memory_available(cls):
         return True
@@ -157,6 +189,10 @@ class XPUPlatform(Platform):
         torch.xpu.reset_peak_memory_stats(device)
         return torch.xpu.max_memory_allocated(device)
 
+    @classmethod
+    def fp8_dtype(cls) -> torch.dtype:
+        return torch.float8_e5m2
+
     @classmethod
     def is_data_center_gpu(cls) -> bool:
         device_name = cls.get_device_name().lower()
@@ -188,3 +224,27 @@ class XPUPlatform(Platform):
     @classmethod
     def opaque_attention_op(cls) -> bool:
         return True
+
+    @classmethod
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from src_cache to dst_cache on XPU."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)
+
+    @classmethod
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from XPU to host (CPU)."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.cpu()
diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py
index 5c73188d5df51cf81c6599608817d96a5e00830b..62b224cac5e53073e818b35bf4f670c3af148bc8 100644
--- a/vllm/plugins/io_processors/interface.py
+++ b/vllm/plugins/io_processors/interface.py
@@ -49,7 +49,12 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
         request_id: Optional[str] = None,
         **kwargs,
     ) -> IOProcessorOutput:
-        collected_output = [item async for i, item in model_output]
+        # We cannot guarantee outputs are returned in the same order they were
+        # fed to vLLM.
+        # Let's sort them by id before post_processing
+        sorted_output = sorted([(i, item) async for i, item in model_output],
+                               key=lambda output: output[0])
+        collected_output = [output[1] for output in sorted_output]
         return self.post_process(collected_output, request_id, **kwargs)
 
     @abstractmethod
@@ -59,4 +64,4 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     @abstractmethod
     def output_to_response(
             self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
-        raise NotImplementedError
\ No newline at end of file
+        raise NotImplementedError
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index 05a72ac23bf2e923ef55ed0f80d7ad1e8c1cc7d5..3bd4d872ce22fc8a5b08d5f733063e953814793f 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -6,6 +6,7 @@ from typing import Optional, Union
 
 from transformers import PreTrainedTokenizerBase
 
+from vllm.entrypoints.harmony_utils import parse_chat_output
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaMessage)
 from vllm.logger import init_logger
@@ -14,7 +15,7 @@ from vllm.reasoning import ReasoningParser, ReasoningParserManager
 logger = init_logger(__name__)
 
 
-@ReasoningParserManager.register_module("GptOss")
+@ReasoningParserManager.register_module("openai_gptoss")
 class GptOssReasoningParser(ReasoningParser):
     """
     Reasoning parser for GptOss model.
@@ -39,9 +40,10 @@ class GptOssReasoningParser(ReasoningParser):
         return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
-        raise RuntimeError(
-            "GptOss model uses harmony to extract reasoning content. This "
-            "function should not be called.")
+        _, content, _ = parse_chat_output(input_ids)
+        if content is None:
+            return []
+        return self.model_tokenizer.encode(content)
 
     def extract_reasoning_content_streaming(
         self,
@@ -52,13 +54,34 @@ class GptOssReasoningParser(ReasoningParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
     ) -> Union[DeltaMessage, None]:
-        raise RuntimeError(
-            "GptOss model uses harmony to extract reasoning content. This "
-            "function should not be called.")
+        prev_reasoning, prev_content, _ = parse_chat_output(
+            list(previous_token_ids))
+        cur_reasoning, cur_content, _ = parse_chat_output(
+            list(current_token_ids))
+        reasoning_delta = None
+        content_delta = None
+        if cur_reasoning is not None:
+            prev_r = prev_reasoning or ""
+            if cur_reasoning.startswith(prev_r):
+                reasoning_delta = cur_reasoning[len(prev_r):] or None
+            else:
+                reasoning_delta = cur_reasoning
+        if cur_content is not None:
+            prev_c = prev_content or ""
+            if cur_content.startswith(prev_c):
+                content_delta = cur_content[len(prev_c):] or None
+            else:
+                content_delta = cur_content
+        if reasoning_delta is None and content_delta is None:
+            return None
+        return DeltaMessage(reasoning_content=reasoning_delta,
+                            content=content_delta)
 
     def extract_reasoning_content(
-            self, model_output: str, request: ChatCompletionRequest
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
     ) -> tuple[Optional[str], Optional[str]]:
-        raise RuntimeError(
-            "GptOss model uses harmony to extract reasoning content. This "
-            "function should not be called.")
+        raise NotImplementedError(
+            "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
+        )
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index c7b4ba34c602e964d5c8139e8a3ee1079eaab428..fe93e906064e43a6a04f3191f98c135950ea2455 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -165,7 +165,8 @@ class SamplingParams(
     the sampled token, so there may be up to `logprobs+1` elements in the
     response. When set to -1, return all `vocab_size` log probabilities."""
     prompt_logprobs: Optional[int] = None
-    """Number of log probabilities to return per prompt token."""
+    """Number of log probabilities to return per prompt token.
+    When set to -1, return all `vocab_size` log probabilities."""
     # NOTE: This parameter is only exposed at the engine level for now.
     # It is not exposed in the OpenAI API server, as the OpenAI API does
     # not support returning only a list of token IDs.
@@ -409,9 +410,11 @@ class SamplingParams(
                 and self.logprobs < 0):
             raise ValueError(
                 f"logprobs must be non-negative or -1, got {self.logprobs}.")
-        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
-            raise ValueError(f"prompt_logprobs must be non-negative, got "
-                             f"{self.prompt_logprobs}.")
+        if (self.prompt_logprobs is not None and self.prompt_logprobs != -1
+                and self.prompt_logprobs < 0):
+            raise ValueError(
+                f"prompt_logprobs must be non-negative or -1, got "
+                f"{self.prompt_logprobs}.")
         if (self.truncate_prompt_tokens is not None
                 and (self.truncate_prompt_tokens == 0
                      or self.truncate_prompt_tokens < -1)):
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index 6f11ab8e0300a4761216e4b07855d22a87050311..055f28914ad597c2a8561be63107783d9fdff145 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -269,7 +269,7 @@ class ScalarType:
 
     @classmethod
     def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
-        """Create a unsigned integer scalar type."""
+        """Create an unsigned integer scalar type."""
         ret = cls(0, size_bits, False, bias if bias else 0)
         ret.id  # noqa B018: make sure the id is cached
         return ret
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 7b48b7be9f511346fa989d34bf76443fc35ae6f5..24114c0bb792ee2ab46418ee1d83f31c9c6d1715 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -16,6 +16,7 @@ import msgspec
 import torch
 
 from vllm.inputs import SingletonInputs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import RequestOutputKind, SamplingParams
@@ -38,30 +39,6 @@ def array_full(token_id: int, count: int):
     return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
 
 
-# We use dataclass for now because it is used for
-# openai server output, and msgspec is not serializable.
-# TODO(sang): Fix it.
-@dataclass
-class Logprob:
-    """Infos for supporting OpenAI compatible logprobs and token ranks.
-
-    Attributes:
-        logprob: The logprob of chosen token
-        rank: The vocab rank of chosen token (>=1)
-        decoded_token: The decoded chosen token index
-    """
-    logprob: float
-    rank: Optional[int] = None
-    decoded_token: Optional[str] = None
-
-
-# {token_id -> logprob} per each sequence group. None if the corresponding
-# sequence group doesn't require prompt logprob.
-PromptLogprobs = list[Optional[dict[int, Logprob]]]
-# {token_id -> logprob} for each sequence group.
-SampleLogprobs = list[dict[int, Logprob]]
-
-
 class SequenceStatus(enum.IntEnum):
     """Status of a sequence."""
     WAITING = 0
@@ -1216,7 +1193,7 @@ class HiddenStates(msgspec.Struct, array_like=True,
     seq_ids are the sequence ids of each entry of the batch
     dimension of the hidden_states tensor"""
     # Scorer hidden states. For prefill step, it is used for hidden states of
-    # all tokens, whereas for decode step, it use used for last accepted tokens.
+    # all tokens, whereas for decode step, it is used for last accepted tokens.
     hidden_states: torch.Tensor
     # The sequence group metadata list. Only needed for decode step.
     seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index d215e5d8bf657a35c1ee9059b51338e9b1f07220..6aabbc217dd03c630f3f4ef498a5adc832ea11c1 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1022,7 +1022,7 @@ def _extractNVMLErrorsAsClasses():
     Each NVML Error gets a new NVMLError subclass. This way try,except blocks can filter appropriate
     exceptions more easily.
 
-    NVMLError is a parent class. Each NVML_ERROR_* gets it's own subclass.
+    NVMLError is a parent class. Each NVML_ERROR_* gets its own subclass.
     e.g. NVML_ERROR_ALREADY_INITIALIZED will be turned into NVMLError_AlreadyInitialized
     '''
     this_module = sys.modules[__name__]
@@ -3533,7 +3533,7 @@ def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle):
         return []
     elif (ret == NVML_ERROR_INSUFFICIENT_SIZE):
         # typical case
-        # oversize the array incase more processes are created
+        # oversize the array in case more processes are created
         c_count.value = c_count.value * 2 + 5
         proc_array = c_nvmlProcessInfo_v3_t * c_count.value
         c_procs = proc_array()
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 6a287d82be5ffbabcd732a25d340e9a306e75fce..7537e9901a044d469eb3bdb979890e2020e0a7ae 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -119,6 +119,11 @@ class SpanAttributes:
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
     GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
         "gen_ai.latency.time_in_model_execute")
+    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \
+        "gen_ai.latency.time_in_model_prefill"
+    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
+    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \
+        "gen_ai.latency.time_in_model_inference"
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index bec792465bfbbfffe2915f54f8bf3daebe50829c..2852d16ec53f4bef9d037b0ff1a242388eb9e66d 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import enum
 import json
 import os
 import time
 from functools import cache, partial
 from pathlib import Path
-from typing import Any, Callable, Optional, TypeVar, Union
+from typing import Any, Callable, Literal, Optional, TypeVar, Union
 
 import huggingface_hub
 from huggingface_hub import get_safetensors_metadata, hf_hub_download
@@ -27,6 +26,7 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.transformers_utils.config_parser_base import ConfigParserBase
 from vllm.transformers_utils.utils import check_gguf_file
 
 if envs.VLLM_USE_MODELSCOPE:
@@ -71,6 +71,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     jais="JAISConfig",
     mlp_speculator="MLPSpeculatorConfig",
     medusa="MedusaConfig",
+    midashenglm="MiDashengLMConfig",
     eagle="EAGLEConfig",
     speculators="SpeculatorsConfig",
     nemotron="NemotronConfig",
@@ -78,7 +79,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
     ultravox="UltravoxConfig",
     step3_vl="Step3VLConfig",
     step3_text="Step3TextConfig",
-)
+    qwen3_next="Qwen3NextConfig")
 
 _CONFIG_ATTRS_MAPPING: dict[str, str] = {
     "llm_config": "text_config",
@@ -99,10 +100,163 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
 }
 
 
-class ConfigFormat(str, enum.Enum):
-    AUTO = "auto"
-    HF = "hf"
-    MISTRAL = "mistral"
+class HFConfigParser(ConfigParserBase):
+
+    def parse(self,
+              model: Union[str, Path],
+              trust_remote_code: bool,
+              revision: Optional[str] = None,
+              code_revision: Optional[str] = None,
+              **kwargs) -> tuple[dict, PretrainedConfig]:
+        kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model,
+            revision=revision,
+            code_revision=code_revision,
+            token=_get_hf_token(),
+            **kwargs,
+        )
+        # Use custom model class if it's in our registry
+        model_type = config_dict.get("model_type")
+        if model_type is None:
+            model_type = "speculators" if config_dict.get(
+                "speculators_config") is not None else model_type
+
+        if model_type in _CONFIG_REGISTRY:
+            config_class = _CONFIG_REGISTRY[model_type]
+            config = config_class.from_pretrained(
+                model,
+                revision=revision,
+                code_revision=code_revision,
+                token=_get_hf_token(),
+                **kwargs,
+            )
+        else:
+            try:
+                kwargs = _maybe_update_auto_config_kwargs(
+                    kwargs, model_type=model_type)
+                config = AutoConfig.from_pretrained(
+                    model,
+                    trust_remote_code=trust_remote_code,
+                    revision=revision,
+                    code_revision=code_revision,
+                    token=_get_hf_token(),
+                    **kwargs,
+                )
+            except ValueError as e:
+                if (not trust_remote_code
+                        and "requires you to execute the configuration file"
+                        in str(e)):
+                    err_msg = (
+                        "Failed to load the model config. If the model "
+                        "is a custom model not yet available in the "
+                        "HuggingFace transformers library, consider setting "
+                        "`trust_remote_code=True` in LLM or using the "
+                        "`--trust-remote-code` flag in the CLI.")
+                    raise RuntimeError(err_msg) from e
+                else:
+                    raise e
+        config = _maybe_remap_hf_config_attrs(config)
+        return config_dict, config
+
+
+class MistralConfigParser(ConfigParserBase):
+
+    def parse(self,
+              model: Union[str, Path],
+              trust_remote_code: bool,
+              revision: Optional[str] = None,
+              code_revision: Optional[str] = None,
+              **kwargs) -> tuple[dict, PretrainedConfig]:
+        # This function loads a params.json config which
+        # should be used when loading models in mistral format
+        config_dict = _download_mistral_config_file(model, revision)
+        if (max_position_embeddings :=
+                config_dict.get("max_position_embeddings")) is None:
+            max_position_embeddings = _maybe_retrieve_max_pos_from_hf(
+                model, revision, **kwargs)
+            config_dict["max_position_embeddings"] = max_position_embeddings
+
+        from vllm.transformers_utils.configs.mistral import adapt_config_dict
+
+        config = adapt_config_dict(config_dict)
+
+        # Mistral configs may define sliding_window as list[int]. Convert it
+        # to int and add the layer_types list[str] to make it HF compatible
+        if ((sliding_window := getattr(config, "sliding_window", None))
+                and isinstance(sliding_window, list)):
+            pattern_repeats = config.num_hidden_layers // len(sliding_window)
+            layer_types = sliding_window * pattern_repeats
+            config.layer_types = [
+                "full_attention" if layer_type is None else "sliding_attention"
+                for layer_type in layer_types
+            ]
+            config.sliding_window = next(filter(None, sliding_window), None)
+
+        return config_dict, config
+
+
+_CONFIG_FORMAT_TO_CONFIG_PARSER: dict[str, type[ConfigParserBase]] = {
+    "hf": HFConfigParser,
+    "mistral": MistralConfigParser,
+}
+
+ConfigFormat = Literal[
+    "auto",
+    "hf",
+    "mistral",
+]
+
+
+def get_config_parser(config_format: str) -> ConfigParserBase:
+    """Get the config parser for a given config format."""
+    if config_format not in _CONFIG_FORMAT_TO_CONFIG_PARSER:
+        raise ValueError(f"Unknown config format `{config_format}`.")
+    return _CONFIG_FORMAT_TO_CONFIG_PARSER[config_format]()
+
+
+def register_config_parser(config_format: str):
+
+    """Register a customized vllm config parser.
+    When a config format is not supported by vllm, you can register a customized
+   config parser to support it.
+    Args:
+        config_format (str): The config parser format name.
+    Examples:
+
+        >>> from vllm.transformers_utils.config import (get_config_parser,
+                                                        register_config_parser)
+        >>> from vllm.transformers_utils.config_parser_base import ConfigParserBase
+        >>>
+        >>> @register_config_parser("custom_config_parser")
+        ... class CustomConfigParser(ConfigParserBase):
+        ...     def parse(self,
+        ...            model: Union[str, Path],
+        ...            trust_remote_code: bool,
+        ...            revision: Optional[str] = None,
+        ...            code_revision: Optional[str] = None,
+        ...           **kwargs) -> tuple[dict, PretrainedConfig]:
+        ...        raise NotImplementedError
+        >>>
+        >>> type(get_config_parser("custom_config_parser"))
+        <class 'CustomConfigParser'>
+    """  # noqa: E501
+
+    def _wrapper(config_parser_cls):
+        if config_format in _CONFIG_FORMAT_TO_CONFIG_PARSER:
+            logger.warning(
+                "Config format `%s` is already registered, and will be "
+                "overwritten by the new parser class `%s`.", config_format,
+                config_parser_cls)
+        if not issubclass(config_parser_cls, ConfigParserBase):
+            raise ValueError("The config parser must be a subclass of "
+                             "`ConfigParserBase`.")
+        _CONFIG_FORMAT_TO_CONFIG_PARSER[config_format] = config_parser_cls
+        logger.info("Registered config parser `%s` with config format `%s`",
+                    config_parser_cls, config_format)
+        return config_parser_cls
+
+    return _wrapper
 
 
 _R = TypeVar("_R")
@@ -349,7 +503,7 @@ def get_config(
     trust_remote_code: bool,
     revision: Optional[str] = None,
     code_revision: Optional[str] = None,
-    config_format: ConfigFormat = ConfigFormat.AUTO,
+    config_format: Union[str, ConfigFormat] = "auto",
     hf_overrides_kw: Optional[dict[str, Any]] = None,
     hf_overrides_fn: Optional[Callable[[PretrainedConfig],
                                        PretrainedConfig]] = None,
@@ -362,20 +516,22 @@ def get_config(
         kwargs["gguf_file"] = Path(model).name
         model = Path(model).parent
 
-    if config_format == ConfigFormat.AUTO:
+    if config_format == "auto":
         try:
             if is_gguf or file_or_path_exists(
                     model, HF_CONFIG_NAME, revision=revision):
-                config_format = ConfigFormat.HF
+                config_format = "hf"
             elif file_or_path_exists(model,
                                      MISTRAL_CONFIG_NAME,
                                      revision=revision):
-                config_format = ConfigFormat.MISTRAL
+                config_format = "mistral"
             else:
                 raise ValueError(
                     "Could not detect config format for no config file found. "
-                    "Ensure your model has either config.json (HF format) "
-                    "or params.json (Mistral format).")
+                    "With config_format 'auto', ensure your model has either"
+                    "config.json (HF format) or params.json (Mistral format)."
+                    "Otherwise please specify your_custom_config_format"
+                    "in engine args for customized config parser")
 
         except Exception as e:
             error_message = (
@@ -394,92 +550,14 @@ def get_config(
 
             raise ValueError(error_message) from e
 
-    if config_format == ConfigFormat.HF:
-        kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
-        config_dict, _ = PretrainedConfig.get_config_dict(
-            model,
-            revision=revision,
-            code_revision=code_revision,
-            token=_get_hf_token(),
-            **kwargs,
-        )
-        # Use custom model class if it's in our registry
-        model_type = config_dict.get("model_type")
-        if model_type is None:
-            model_type = "speculators" if config_dict.get(
-                "speculators_config") is not None else model_type
-
-        if model_type in _CONFIG_REGISTRY:
-            config_class = _CONFIG_REGISTRY[model_type]
-            config = config_class.from_pretrained(
-                model,
-                revision=revision,
-                code_revision=code_revision,
-                token=_get_hf_token(),
-                **kwargs,
-            )
-        else:
-            try:
-                kwargs = _maybe_update_auto_config_kwargs(
-                    kwargs, model_type=model_type)
-                config = AutoConfig.from_pretrained(
-                    model,
-                    trust_remote_code=trust_remote_code,
-                    revision=revision,
-                    code_revision=code_revision,
-                    token=_get_hf_token(),
-                    **kwargs,
-                )
-            except ValueError as e:
-                if (not trust_remote_code
-                        and "requires you to execute the configuration file"
-                        in str(e)):
-                    err_msg = (
-                        "Failed to load the model config. If the model "
-                        "is a custom model not yet available in the "
-                        "HuggingFace transformers library, consider setting "
-                        "`trust_remote_code=True` in LLM or using the "
-                        "`--trust-remote-code` flag in the CLI.")
-                    raise RuntimeError(err_msg) from e
-                else:
-                    raise e
-        config = _maybe_remap_hf_config_attrs(config)
-
-    elif config_format == ConfigFormat.MISTRAL:
-        # This function loads a params.json config which
-        # should be used when loading models in mistral format
-        config_dict = _download_mistral_config_file(model, revision)
-        if (max_position_embeddings :=
-                config_dict.get("max_position_embeddings")) is None:
-            max_position_embeddings = _maybe_retrieve_max_pos_from_hf(
-                model, revision, **kwargs)
-            config_dict["max_position_embeddings"] = max_position_embeddings
-
-        from vllm.transformers_utils.configs.mistral import adapt_config_dict
-
-        config = adapt_config_dict(config_dict)
-
-        # Mistral configs may define sliding_window as list[int]. Convert it
-        # to int and add the layer_types list[str] to make it HF compatible
-        if ((sliding_window := getattr(config, "sliding_window", None))
-                and isinstance(sliding_window, list)):
-            pattern_repeats = config.num_hidden_layers // len(sliding_window)
-            layer_types = sliding_window * pattern_repeats
-            config.layer_types = [
-                "full_attention" if layer_type is None else "sliding_attention"
-                for layer_type in layer_types
-            ]
-            config.sliding_window = next(filter(None, sliding_window), None)
-    else:
-        supported_formats = [
-            fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO
-        ]
-        raise ValueError(
-            f"Unsupported config format: {config_format}. "
-            f"Supported formats are: {', '.join(supported_formats)}. "
-            f"Ensure your model uses one of these configuration formats "
-            f"or specify the correct format explicitly.")
-
+    config_parser = get_config_parser(config_format)
+    config_dict, config = config_parser.parse(
+        model,
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        code_revision=code_revision,
+        **kwargs,
+    )
     # Special architecture mapping check for GGUF models
     if is_gguf:
         if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
@@ -913,7 +991,7 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
         hf_config = get_config(model=model,
                                trust_remote_code=trust_remote_code_val,
                                revision=revision,
-                               config_format=ConfigFormat.HF)
+                               config_format="hf")
         if hf_value := hf_config.get_text_config().max_position_embeddings:
             max_position_embeddings = hf_value
     except Exception as e:
diff --git a/vllm/transformers_utils/config_parser_base.py b/vllm/transformers_utils/config_parser_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c27177f74d4ba8032d6c0ad41a1afa39377d281f
--- /dev/null
+++ b/vllm/transformers_utils/config_parser_base.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class ConfigParserBase(ABC):
+
+    @abstractmethod
+    def parse(self,
+              model: Union[str, Path],
+              trust_remote_code: bool,
+              revision: Optional[str] = None,
+              code_revision: Optional[str] = None,
+              **kwargs) -> tuple[dict, PretrainedConfig]:
+        raise NotImplementedError
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 8339c55bcf8080e4a7e83ad2160349c6b3aeac4d..cdae59ccc24e0128ce6130cb1099739a47f16f04 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -17,12 +17,14 @@ from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
 from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
 from vllm.transformers_utils.configs.medusa import MedusaConfig
+from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
 from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
+from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
 from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
 from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
                                                       Step3VisionEncoderConfig,
@@ -36,6 +38,7 @@ __all__ = [
     "RWConfig",
     "JAISConfig",
     "MedusaConfig",
+    "MiDashengLMConfig",
     "MLPSpeculatorConfig",
     "MoonViTConfig",
     "KimiVLConfig",
@@ -48,4 +51,5 @@ __all__ = [
     "Step3VLConfig",
     "Step3VisionEncoderConfig",
     "Step3TextConfig",
+    "Qwen3NextConfig",
 ]
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 6aabf9e5262e68249a8249a6d92c833b7dd97fba..444ed70de3d0ce60c4c01b518af17041b535f778 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -46,6 +46,7 @@ class EAGLEConfig(PretrainedConfig):
         # Eagle model name should follow naming convention of
         # LlamaForCausalLM -> EagleLlamaForCausalLM
         # LlamaForCausalLM -> Eagle3LlamaForCausalLM
+        # LlamaForCausalLMEagle3 -> LlamaForCausalLMEagle3
         if method == "eagle":
             assert self.model is not None, \
                 "model should not be None when method is eagle"
@@ -53,6 +54,7 @@ class EAGLEConfig(PretrainedConfig):
                 f"Eagle{arch}" if not arch.startswith("Eagle") \
                     else arch for arch in self.model.architectures
             ]
+
         elif method == "eagle3":
             assert self.model is not None, \
                 "model should not be None when method is eagle3"
diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c23202e23c8e534c78319ce12c9a45da731be2a
--- /dev/null
+++ b/vllm/transformers_utils/configs/midashenglm.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 Horizon team, Xiaomi MiLM Plus.
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
+    Qwen2_5OmniTextConfig)
+
+
+class DashengConfig(PretrainedConfig):
+    model_type = "midashenglm_dasheng_encoder"
+
+    def __init__(
+        self,
+        embed_dim: int = 768,
+        outputdim: int = 527,
+        patch_size: Union[int, tuple[int, int]] = 16,
+        patch_stride: Union[int, tuple[int, int]] = 16,
+        input_channels: int = 1,
+        target_length: int = 1012,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        init_values: Optional[float] = None,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        f_min: float = 0.0,
+        f_max: float = 8000.0,
+        center: bool = True,
+        win_length: int = 512,
+        hop_length: int = 160,
+        sample_rate: int = 16000,
+        n_fft: int = 512,
+        n_mels: int = 64,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.outputdim = outputdim
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.input_channels = input_channels
+        self.target_length = target_length
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.init_values = init_values
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.f_min = f_min
+        self.f_max = f_max
+        self.center = center
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        super().__init__(**kwargs)
+
+
+class MiDashengLMConfig(PretrainedConfig):
+    model_type = "midashenglm"
+
+    def __init__(
+        self,
+        audio_encoder_config: Optional[dict] = None,
+        subsample_factor: int = 5,
+        text_config: Optional[dict] = None,
+        audio_token_id: Optional[int] = None,
+        **kwargs,
+    ):
+        self.audio_encoder_config = DashengConfig(
+            **(audio_encoder_config or {}))
+        self.subsample_factor = subsample_factor
+        self.text_config = (Qwen2_5OmniTextConfig(
+            **text_config) if text_config else Qwen2_5OmniTextConfig())
+        self.text_config.rope_scaling = None  # uses_mrope is false
+        self.audio_token_id = audio_token_id
+        super().__init__(**kwargs)
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 8a9c660b882fd4f877ede8157920ee6fc3d42ae6..5d9206e188322f0626020b5953cbfde24b9e4040 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -157,6 +157,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             encoder_attention_heads=encoder_args["n_heads"],
             vocab_size=encoder_args["vocab_size"],
             max_source_positions=encoder_args["max_source_positions"],
+            is_encoder_decoder=False,  # Override WhisperConfig default
         )
     }
     if quant_config:
diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py
index 9a7243b1262c0e2ccfae4a43906d21dc5ccb4991..090fefa14203e8625e42fdc26fcdc589f910f348 100644
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class NemotronConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a
-    [`NemotronModel`]. It is used to instantiate an Nemotron model
+    [`NemotronModel`]. It is used to instantiate a Nemotron model
     according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar
     configuration to that of the Nemotron-8B.
diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py
index 027f2911543f5a14fdb6643d4b69a8ba8c409c5a..581bed5716c1c3370f638092a2ff5ab395813ca4 100644
--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -38,7 +38,7 @@ class NemotronHConfig(PretrainedConfig):
             passed when calling [`NemotronHModel`]
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be
-            tied. Note that this is only relevant if the model has a output
+            tied. Note that this is only relevant if the model has an output
             word embedding layer.
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7af26acd1b9f82e36ae5033c149e6e368ea41c1
--- /dev/null
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -0,0 +1,275 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3-Next model configuration"""
+
+from transformers.configuration_utils import (PretrainedConfig,
+                                              layer_type_validation)
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3NextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
+    Qwen3-Next model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+        linear_num_key_heads (`int`, *optional*, defaults to 16):
+            Number of key heads used in linear attention layers.
+        linear_num_value_heads (`int`, *optional*, defaults to 32):
+            Number of value heads used in linear attention layers.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the routed expert.
+        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the shared expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 10):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 512):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        layer_types (`list[str]`, *optional*):
+            Types of each layer (attention or linear).
+
+    ```python
+    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
+
+    >>> # Initializing a Qwen3Next style configuration
+    >>> configuration =  Qwen3NextConfig()
+
+    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
+    >>> model = Qwen3NextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """  # noqa: E501
+
+    model_type = "qwen3_next"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.shared_experts.gate_proj": "colwise",
+        "layers.*.mlp.shared_experts.up_proj": "colwise",
+        "layers.*.mlp.shared_experts.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=48,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.25,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        decoder_sparse_step=1,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=10,
+        num_experts=512,
+        norm_topk_prob=True,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        layer_types=None,
+        **kwargs,
+    ):
+        if mlp_only_layers is None:
+            mlp_only_layers = []
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+        rope_config_validation(self)
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "linear_attention" if bool((i + 1) % 4) else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        # linear attention part
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = mlp_only_layers
+
+
+__all__ = ["Qwen3NextConfig"]
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 87064cc12dedad14c074e687304c898e87d1e2b2..e67479516560fc5344e8f05a7f975fff235ae29a 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -20,10 +20,13 @@ class UltravoxConfig(transformers.PretrainedConfig):
 
     Args:
         audio_config (`Union[AutoConfig, dict]`,  *optional*):
-            Custom audio config or dict
+            Custom audio config or dict.
         text_config (`Union[AutoConfig, dict]`, *optional*):
-            The config object of the text backbone. Can be any of `LlamaConfig`
-            or `MistralConfig`.
+            The config object of the text backbone.
+        audio_model_id (`str`, *optional*):
+            The model ID of the audio backbone.
+        text_model_id (`str`, *optional*):
+            The model ID of the text backbone.
         ignore_index (`int`, *optional*, defaults to -100):
             The ignore index for the loss function.
         audio_token_index (`int`, *optional*, defaults to 32000):
@@ -43,7 +46,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
             projector or at the end. Versions v0.4.1 and below
             use `False`, but v0.5 and above use `True`.
     """
-
+    wrapped_model_config: transformers.PretrainedConfig
     model_type = "ultravox"
     audio_token = "<|audio|>"
     is_composition = False
@@ -60,15 +63,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
         stack_factor: int = 8,
         norm_init: float = 0.4,
         projector_act: str = "swiglu",
-        text_model_lora_config: Optional[dict[str, Any]] = None,
-        audio_model_lora_config: Optional[dict[str, Any]] = None,
         projector_ln_mid: bool = False,
         **kwargs,
     ):
         self.ignore_index = ignore_index
-
-        self.audio_model_id = audio_model_id
-        self.text_model_id = text_model_id
         self.audio_token_index = audio_token_index
 
         self.hidden_size = hidden_size
@@ -77,36 +75,46 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.projector_act = projector_act
         self.projector_ln_mid = projector_ln_mid
 
-        if text_model_id is not None:
-            # Avoid circular import
-            from vllm.transformers_utils.config import get_config
-
-            text_config_obj = get_config(text_model_id,
-                                         trust_remote_code=False)
-        else:
+        # N.B. May set the wrapped_model_config below.
+        self.text_model_id = text_model_id
+        if text_model_id is None:
             text_config = text_config or {}
-            text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
-                "model_type", "llama")](**text_config)
+            self.wrapped_model_config = transformers.CONFIG_MAPPING[
+                text_config.get("model_type", "llama")](**text_config)
+
+        # N.B. May set the audio_config below.
+        self.audio_model_id = audio_model_id
+        if audio_model_id is None:
+            self.audio_model_id = None
+            audio_config = audio_config or {}
+            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+                "model_type", "whisper")](**audio_config)
 
-        inner_text_config = text_config_obj.get_text_config()
+        super().__init__(**kwargs)
 
-        if audio_model_id is not None:
-            # Avoid circular import
+    def __setattr__(self, key, value):
+        # Since --hf-overrides are applied _after_ the UltravoxConfig is
+        # instantiated, load the configs implicitly when assigning text_model_id
+        # or audio_model_id. This allows:
+        #
+        #   --hf-overrides.text_model_id=<quantized variant>
+        #
+        # to behave as intended.
+        if key == "text_model_id" and value is not None:
             from vllm.transformers_utils.config import get_config
 
-            audio_config = get_config(audio_model_id, trust_remote_code=False)
-        else:
-            audio_config = audio_config or {}
-            audio_config = transformers.CONFIG_MAPPING[audio_config.get(
-                "model_type", "whisper")](**audio_config)
+            self.wrapped_model_config = get_config(value,
+                                                   trust_remote_code=False)
+        elif key == "audio_model_id" and value is not None:
+            from vllm.transformers_utils.config import get_config
 
-        self.text_config = text_config_obj
-        self.audio_config = audio_config
-        self.text_model_lora_config = text_model_lora_config or {}
-        self.audio_model_lora_config = audio_model_lora_config or {}
+            self.audio_config = get_config(value, trust_remote_code=False)
 
-        self.vocab_size = inner_text_config.vocab_size
-        self.initializer_range = inner_text_config.initializer_range
-        self.text_hidden_size = inner_text_config.hidden_size
+        return super().__setattr__(key, value)
 
-        super().__init__(**kwargs)
+    @property
+    def text_config(self) -> transformers.PretrainedConfig:
+        # When Ultravox wraps a multi-modal model (e.g. Gemma), we instantiate
+        # the full model, but the text config is the text config of the inner
+        # model.
+        return self.wrapped_model_config.get_text_config()
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 380c62a141f0f23fefb4b1074382896860867ba5..56b01ecf78c46dcd8c1b4c3af7f0a8aa75c77235 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -3,8 +3,9 @@
 
 from typing import Optional
 
-from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams,
-                           Sequence, SequenceGroup)
+from vllm.logprobs import Logprob
+from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence,
+                           SequenceGroup)
 
 from .detokenizer_utils import (convert_prompt_ids_to_tokens,
                                 detokenize_incrementally)
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index 557d251c45f3b27850c1cc8ea0301fa511d27117..0077a7a8ce6562b2c9dc4de951df9aa16ee34170 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -55,7 +55,7 @@ class OvisProcessorKwargs(ProcessingKwargs, total=False):   # type: ignore[call-
 
 class OvisProcessor(ProcessorMixin):
     r"""
-    Constructs a Ovis processor which wraps a Ovis image processor and a Qwen2 tokenizer into a single processor.
+    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
     [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
     Args:
diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py
index d3273257ff8c20d082860d64d5983bfaafc03f9d..282e9cb2116e0a67306032b03dca1e6c11d3a37b 100644
--- a/vllm/transformers_utils/processors/ovis2_5.py
+++ b/vllm/transformers_utils/processors/ovis2_5.py
@@ -41,7 +41,7 @@ class Ovis2_5ProcessorKwargs(ProcessingKwargs,
 
 class Ovis2_5Processor(ProcessorMixin):
     r"""
-    Constructs a Ovis processor which wraps a Ovis image processor
+    Constructs an Ovis processor which wraps an Ovis image processor
     and a Qwen2 tokenizer into a single processor.
     [`OvisProcessor`] offers all the functionalities of 
     [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. 
diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..357c180ed35f2dcc01641c668136ef6b5df700e2
--- /dev/null
+++ b/vllm/transformers_utils/runai_utils.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import shutil
+import signal
+import tempfile
+from typing import Optional
+
+from vllm.logger import init_logger
+from vllm.utils import PlaceholderModule
+
+logger = init_logger(__name__)
+
+SUPPORTED_SCHEMES = ['s3://', 'gs://']
+
+try:
+    from runai_model_streamer import list_safetensors as runai_list_safetensors
+    from runai_model_streamer import pull_files as runai_pull_files
+except (ImportError, OSError):
+    # see https://github.com/run-ai/runai-model-streamer/issues/26
+    # OSError will be raised on arm64 platform
+    runai_model_streamer = PlaceholderModule(
+        "runai_model_streamer")  # type: ignore[assignment]
+    runai_pull_files = runai_model_streamer.placeholder_attr("pull_files")
+    runai_list_safetensors = runai_model_streamer.placeholder_attr(
+        "list_safetensors")
+
+
+def list_safetensors(path: str = "") -> list[str]:
+    """
+    List full file names from object path and filter by allow pattern.
+
+    Args:
+        path: The object storage path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+
+    Returns:
+        list[str]: List of full object storage paths allowed by the pattern
+    """
+    return runai_list_safetensors(path)
+
+
+def is_runai_obj_uri(model_or_path: str) -> bool:
+    return model_or_path.lower().startswith(tuple(SUPPORTED_SCHEMES))
+
+
+class ObjectStorageModel:
+    """
+    A class representing an ObjectStorage model mirrored into a
+    temporary directory.
+
+    Attributes:
+        dir: The temporary created directory.
+
+    Methods:
+        pull_files(): Pull model from object storage to the temporary
+        directory.
+    """
+
+    def __init__(self) -> None:
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            existing_handler = signal.getsignal(sig)
+            signal.signal(sig, self._close_by_signal(existing_handler))
+
+        self.dir = tempfile.mkdtemp()
+
+    def __del__(self):
+        self._close()
+
+    def _close(self) -> None:
+        if os.path.exists(self.dir):
+            shutil.rmtree(self.dir)
+
+    def _close_by_signal(self, existing_handler=None):
+
+        def new_handler(signum, frame):
+            self._close()
+            if existing_handler:
+                existing_handler(signum, frame)
+
+        return new_handler
+
+    def pull_files(self,
+                   model_path: str = "",
+                   allow_pattern: Optional[list[str]] = None,
+                   ignore_pattern: Optional[list[str]] = None) -> None:
+        """
+        Pull files from object storage into the temporary directory.
+
+        Args:
+            model_path: The object storage path of the model.
+            allow_pattern: A list of patterns of which files to pull.
+            ignore_pattern: A list of patterns of which files not to pull.
+
+        """
+        if not model_path.endswith("/"):
+            model_path = model_path + "/"
+        runai_pull_files(model_path, self.dir, allow_pattern, ignore_pattern)
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index f95aae7815e0bd9418c2ec3be097720fd70c0ec5..62c87c167e682715d6b685789f20cab054870e5a 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -2,11 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
-import os
-import shutil
-import signal
-import tempfile
-from pathlib import Path
 from typing import Optional
 
 from vllm.utils import PlaceholderModule
@@ -93,70 +88,3 @@ def list_files(
         paths = _filter_ignore(paths, ignore_pattern)
 
     return bucket_name, prefix, paths
-
-
-class S3Model:
-    """
-    A class representing a S3 model mirrored into a temporary directory.
-
-    Attributes:
-        s3: S3 client.
-        dir: The temporary created directory.
-
-    Methods:
-        pull_files(): Pull model from S3 to the temporary directory.
-    """
-
-    def __init__(self) -> None:
-        self.s3 = boto3.client('s3')
-        for sig in (signal.SIGINT, signal.SIGTERM):
-            existing_handler = signal.getsignal(sig)
-            signal.signal(sig, self._close_by_signal(existing_handler))
-
-        self.dir = tempfile.mkdtemp()
-
-    def __del__(self):
-        self._close()
-
-    def _close(self) -> None:
-        if os.path.exists(self.dir):
-            shutil.rmtree(self.dir)
-
-    def _close_by_signal(self, existing_handler=None):
-
-        def new_handler(signum, frame):
-            self._close()
-            if existing_handler:
-                existing_handler(signum, frame)
-
-        return new_handler
-
-    def pull_files(self,
-                   s3_model_path: str = "",
-                   allow_pattern: Optional[list[str]] = None,
-                   ignore_pattern: Optional[list[str]] = None) -> None:
-        """
-        Pull files from S3 storage into the temporary directory.
-
-        Args:
-            s3_model_path: The S3 path of the model.
-            allow_pattern: A list of patterns of which files to pull.
-            ignore_pattern: A list of patterns of which files not to pull.
-
-        """
-        if not s3_model_path.endswith("/"):
-            s3_model_path = s3_model_path + "/"
-
-        bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
-                                                  allow_pattern,
-                                                  ignore_pattern)
-        if len(files) == 0:
-            return
-
-        for file in files:
-            destination_file = os.path.join(
-                self.dir,
-                file.removeprefix(base_dir).lstrip("/"))
-            local_dir = Path(destination_file).parent
-            os.makedirs(local_dir, exist_ok=True)
-            self.s3.download_file(bucket_name, file, destination_file)
diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py
index ae8220f9b9dc5249f2970687cd67dc45479e39a8..6b519cccd3cc668a9df813a5ac387145e78d702d 100644
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -5,7 +5,8 @@ from typing import Optional
 
 from typing_extensions import assert_never
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
                                                get_lora_tokenizer,
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 0fcf5d15afd1dbee06a7b54e7454b7fcdfbef6e0..828536e6408b15fc1e1df363eaf5d12b0b83e013 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -7,8 +7,10 @@ from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder,
 if HAS_TRITON:
     import triton
     import triton.language as tl
+    import triton.language.extra.libdevice as tldevice
 else:
     triton = TritonPlaceholder()
     tl = TritonLanguagePlaceholder()
+    tldevice = TritonLanguagePlaceholder()
 
-__all__ = ["HAS_TRITON", "triton", "tl"]
+__all__ = ["HAS_TRITON", "triton", "tl", "tldevice"]
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index e0ed39ef5a4c833b293cb8ea32bbf1b124d95509..9451b1777546b6ab87af14c324ef97f681d7b13d 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -78,6 +78,7 @@ if TYPE_CHECKING:
     from argparse import Namespace
 
     from vllm.config import ModelConfig, VllmConfig
+    from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
 
@@ -1471,9 +1472,9 @@ def current_stream() -> torch.cuda.Stream:
         # On ROCm using the default 0 stream in combination with RCCL
         # is hurting performance. Therefore creating a dedicated stream
         # per process
-        
         # if current_platform.is_rocm():
-        #     _current_stream_tls.value = torch.cuda.Stream()
+        #     # torch.cuda.set_stream here is the alias of _pathed_set_stream
+        #     torch.cuda.set_stream(torch.cuda.Stream())
         if current_platform.is_cpu():
             _current_stream_tls.value = _StreamPlaceholder()
         else:
@@ -2279,7 +2280,8 @@ def weak_ref_tensor(tensor: Any) -> Any:
 
 
 def weak_ref_tensors(
-    tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]]
+    tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor],
+                   IntermediateTensors]
 ) -> Union[torch.Tensor, list[Any], tuple[Any], Any]:
     """
     Convenience function to create weak references to tensors,
@@ -2291,6 +2293,15 @@ def weak_ref_tensors(
         return [weak_ref_tensor(t) for t in tensors]
     if isinstance(tensors, tuple):
         return tuple(weak_ref_tensor(t) for t in tensors)
+
+    # For IntermediateTensors used in pipeline parallelism
+    from vllm.sequence import IntermediateTensors
+    if isinstance(tensors, IntermediateTensors):
+        ret = IntermediateTensors({
+            key: weak_ref_tensor(val)
+            for key, val in tensors.tensors.items()
+        })
+        return ret
     raise ValueError("Invalid type for tensors")
 
 
@@ -2780,7 +2791,10 @@ def memory_profiling(
     result.torch_peak_increase = diff_profile.torch_peak
     result.non_torch_increase = diff_from_create.non_torch_memory
     result.profile_time = diff_profile.timestamp
-    result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory  # noqa
+
+    non_torch_memory = result.non_torch_increase
+    peak_activation_memory = result.torch_peak_increase
+    result.non_kv_cache_memory = non_torch_memory + peak_activation_memory + result.weights_memory  # noqa
 
 
 # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
@@ -3251,7 +3265,7 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
                       and getattr(cfg.attn_config, "alibi", False)))))
 
 
-def sha256(input) -> int:
+def sha256(input) -> bytes:
     """Hash any picklable Python object using SHA-256.
 
     The input is serialized using pickle before hashing, which allows
@@ -3262,16 +3276,15 @@ def sha256(input) -> int:
         input: Any picklable Python object.
 
     Returns:
-        An integer representing the SHA-256 hash of the serialized input.
+        Bytes representing the SHA-256 hash of the serialized input.
     """
     input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
-    return int.from_bytes(hashlib.sha256(input_bytes).digest(),
-                          byteorder="big")
+    return hashlib.sha256(input_bytes).digest()
 
 
-def sha256_cbor_64bit(input) -> int:
+def sha256_cbor(input) -> bytes:
     """
-    Hash objects using CBOR serialization and SHA-256, then truncate to 64bits.
+    Hash objects using CBOR serialization and SHA-256.
 
     This option is useful for non-Python-dependent serialization and hashing.
 
@@ -3282,17 +3295,13 @@ def sha256_cbor_64bit(input) -> int:
             Custom classes must implement CBOR serialization methods.
 
     Returns:
-        An integer in the range [0, 2^64-1] representing the lower 64 bits
-        of the SHA-256 hash of the CBOR serialized input.
+        Bytes representing the SHA-256 hash of the CBOR serialized input.
     """
     input_bytes = cbor2.dumps(input, canonical=True)
-    full_hash = int.from_bytes(hashlib.sha256(input_bytes).digest(),
-                               byteorder="big")
+    return hashlib.sha256(input_bytes).digest()
 
-    return full_hash & ((1 << 64) - 1)
 
-
-def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], int]:
+def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
     """Get a hash function by name, or raise an error if
     the function is not found.
     Args:
@@ -3302,10 +3311,8 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], int]:
     """
     if hash_fn_name == "sha256":
         return sha256
-    if hash_fn_name == "sha256_cbor_64bit":
-        return sha256_cbor_64bit
-    if hash_fn_name == "builtin":
-        return hash
+    if hash_fn_name == "sha256_cbor":
+        return sha256_cbor
 
     raise ValueError(f"Unsupported hash function: {hash_fn_name}")
 
@@ -3368,7 +3375,7 @@ def has_triton_kernels() -> bool:
 
 def set_process_title(name: str,
                       suffix: str = "",
-                      append: bool = False) -> None:
+                      prefix: str = envs.VLLM_PROCESS_NAME_PREFIX) -> None:
     """
     Set the current process title to a specific name with an
     optional suffix.
@@ -3376,15 +3383,11 @@ def set_process_title(name: str,
     Args:
         name: The title to assign to the current process.
         suffix: An optional suffix to append to the base name.
-        append: Whether to append to the existing process title.
+        prefix: A prefix to prepend to the front separated by `::`.
     """
     if suffix:
         name = f"{name}_{suffix}"
-    if append:
-        name = f"{setproctitle.getproctitle()}_{name}"
-    else:
-        name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}"
-    setproctitle.setproctitle(name)
+    setproctitle.setproctitle(f"{prefix}::{name}")
 
 
 def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index fab134733d4fd49f553973e5e16211c3c1b10e2e..83ec65c9b45940bfd323593a76f80ffad6fee7f4 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -200,11 +200,6 @@ def use_trtllm_attention(
         logger.info_once("Using TRTLLM attention (query is quantized).")
         return True
 
-    # TRTLLM prefill attention does not support FP8 kv cache with
-    # non-quantized query
-    if is_prefill and kv_cache_dtype.startswith("fp8"):
-        return False
-
     # If sinks are being used, we must use TRTLLM attention as it's
     # the only backend that supports them
     if has_sinks:
@@ -353,6 +348,12 @@ def flashinfer_scaled_fp8_mm(
     return output
 
 
+@functools.cache
+def flashinfer_disable_q_quantization() -> bool:
+    """Cache result which only depends on the environment"""
+    return envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION
+
+
 __all__ = [
     "has_flashinfer",
     "flashinfer_trtllm_fp8_block_scale_moe",
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index ced8234a7b4336d7053e7c0a67d58645dbfdd346..ab87f3bb4e3cbd2770fdef662c244eaf1b185e69 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -317,8 +317,8 @@ class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]):
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device) -> None:
-        self.kv_cache_spec = kv_cache_spec
-        self.vllm_config = vllm_config
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         self.scheduler_config = vllm_config.scheduler_config
 
         # For reorder
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 6ef89946bffaa366fd57bdc43bfd6103cdd2e7a9..3a965517a0dd6ca026f1b7df625b8062a1fd1677 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -167,7 +167,7 @@ class FlashAttentionMetadataBuilder(
     # work for mixed prefill-decode and uniform-decode. But for non-spec decodes
     # the graphs would not work for mixed prefill-decode; sorta the inverse
     # of UNIFORM_SINGLE_TOKEN_DECODE.
-    # Theres probably a better way to describe this using `AttentionCGSupport`
+    # There's probably a better way to describe this using `AttentionCGSupport`
     # but for now just set it to `UNIFORM_BATCH` to get use to drop down
     # to FULL_AND_PIECEWISE.
     # TODO(luka, lucas): audit FA2 as part of:
@@ -177,12 +177,11 @@ class FlashAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        self.vllm_config = vllm_config
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
-        self.device = device
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
             self.parallel_config)
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 5fc3a1517b6905f4bea1faa3391d1274bf4a6505..98a4cf38bc195d6688b8d06be1bec7ad2ec5a629 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -25,7 +25,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import cdiv, is_pin_memory_available
-from vllm.utils.flashinfer import (supports_trtllm_attention,
+from vllm.utils.flashinfer import (flashinfer_disable_q_quantization,
+                                   supports_trtllm_attention,
                                    use_trtllm_attention)
 from vllm.v1.attention.backends.flash_attn import use_cascade_attention
 # yapf conflicts with isort for this block
@@ -48,8 +49,89 @@ FP4_DTYPE = torch.uint8
 logger = init_logger(__name__)
 
 
-class FlashInferBackend(AttentionBackend):
+@triton.jit
+def _trtllm_prefill_attn_kvfp8_dequant(
+    kv_cache_ptr,
+    block_tables_prefill_ptr,
+    block_table_stride,
+    mock_kv_cache_ptr,
+    k_scale_ptr,
+    v_scale_ptr,
+    K_CACHE_STRIDE: tl.constexpr,
+    KV_CACHE_STRIDE: tl.constexpr,
+):
+    batch_idx = tl.program_id(0).to(tl.int64)
+    mock_block_table_idx = tl.program_id(1).to(tl.int64)
+    orig_page_num = tl.load(block_tables_prefill_ptr +
+                            batch_idx * block_table_stride +
+                            mock_block_table_idx).to(tl.int64)
+    if orig_page_num <= 0:
+        return
+    dequant_dtype = mock_kv_cache_ptr.dtype.element_ty
+
+    # Dequantize K
+    k_scale_val = tl.load(k_scale_ptr)
+    offset = orig_page_num * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
+    fp8_vals = tl.load(kv_cache_ptr + offset)
+    dequantized_vals = fp8_vals.to(tl.float32) * k_scale_val
+    mock_cache_offset = (batch_idx * block_table_stride + mock_block_table_idx
+                         + 1) * KV_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE)
+    dequantized_vals = dequantized_vals.to(dequant_dtype)
+    tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals)
+
+    # Dequantize V
+    v_scale_val = tl.load(v_scale_ptr)
+    offset = (orig_page_num * KV_CACHE_STRIDE + K_CACHE_STRIDE +
+              tl.arange(0, K_CACHE_STRIDE))
+    fp8_vals = tl.load(kv_cache_ptr + offset)
+    dequantized_vals = fp8_vals.to(tl.float32) * v_scale_val
+    mock_cache_offset = (
+        (batch_idx * block_table_stride + mock_block_table_idx + 1) *
+        KV_CACHE_STRIDE + K_CACHE_STRIDE + tl.arange(0, K_CACHE_STRIDE))
+    dequantized_vals = dequantized_vals.to(dequant_dtype)
+    tl.store(mock_kv_cache_ptr + mock_cache_offset, dequantized_vals)
+
+
+def trtllm_prefill_attn_kvfp8_dequant(
+    kv_cache: torch.Tensor,
+    block_tables_prefill: torch.Tensor,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    dequant_dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    batch_size, num_of_page_per_token = block_tables_prefill.shape
+    s = kv_cache.shape
+    assert s[1] == 2
+    assert dequant_dtype in (torch.bfloat16, torch.float16)
+    k_cache_stride = s[2] * s[3] * s[4]
+    kv_cache_stride = k_cache_stride * s[1]
+    new_s = (batch_size * num_of_page_per_token + 1, s[1], s[2], s[3], s[4])
+    # mock kv cache contains just the pages needed by this prefill
+    mock_kv_cache = torch.empty(new_s,
+                                dtype=dequant_dtype,
+                                device=kv_cache.device)
+    # we simply sequentially index the pages needed by this prefill
+    mock_block_table = torch.arange(
+        start=1,
+        end=batch_size * num_of_page_per_token + 1,
+        dtype=torch.int32,
+        device=block_tables_prefill.device,
+    ).reshape(batch_size, num_of_page_per_token)
+    grid = (batch_size, num_of_page_per_token)
+    _trtllm_prefill_attn_kvfp8_dequant[grid](
+        kv_cache,
+        block_tables_prefill,
+        num_of_page_per_token,
+        mock_kv_cache,
+        k_scale,
+        v_scale,
+        k_cache_stride,
+        kv_cache_stride,
+    )
+    return mock_kv_cache, mock_block_table
+
 
+class FlashInferBackend(AttentionBackend):
     accept_output_buffer: bool = True
 
     @classmethod
@@ -122,7 +204,6 @@ class FlashInferBackend(AttentionBackend):
 
 @dataclass
 class FlashInferMetadata:
-
     num_actual_tokens: int  # Number of tokens excluding padding.
 
     # The data type of the query
@@ -163,11 +244,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        self.device = device
-        self.vllm_config = vllm_config
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
-        self.kv_cache_spec = kv_cache_spec
         self._workspace_buffer = None
         self._prefill_wrapper = None  # Wrapper for prefill/append
         self._decode_wrapper = None  # Wrapper for decode (general shape)
@@ -177,8 +256,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                                      self.kv_cache_spec.block_size)
         max_num_reqs = vllm_config.scheduler_config.max_num_seqs
         max_num_pages = max_num_reqs * max_num_pages_per_req
-        self.enable_cuda_graph = self.compilation_config.cudagraph_mode.\
-            decode_mode() == CUDAGraphMode.FULL
+        self.enable_cuda_graph = (self.compilation_config.cudagraph_mode.\
+            decode_mode() == CUDAGraphMode.FULL)
         if self.enable_cuda_graph:
             # For full cudagraph capture, one `decode_wrapper` for each batch
             # size is needed for FlashInfer.
@@ -194,20 +273,21 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         FlashInferBackend.validate_head_size(self.head_dim)
         self.page_size = self.kv_cache_spec.block_size
 
-        self.enable_fusion = (
-            self.compilation_config.pass_config.enable_attn_fusion)
-        self.q_data_type = self.model_config.dtype
         self.cache_dtype = self.cache_config.cache_dtype
         if self.cache_dtype.startswith("fp8"):
             self.kv_cache_dtype = (
                 FlashInferBackend.get_fp8_dtype_for_flashinfer(
                     self.cache_dtype))
-            # Insert FP8 quant for query if FP8 kv cache and attn fusion enabled
-            if self.enable_fusion:
-                self.q_data_type = self.kv_cache_dtype
         else:
+            assert self.kv_cache_spec.dtype == self.model_config.dtype
             self.kv_cache_dtype = self.kv_cache_spec.dtype
 
+        if supports_trtllm_attention()[0] and \
+            not flashinfer_disable_q_quantization():
+            self.q_data_type = self.kv_cache_dtype
+        else:
+            self.q_data_type = self.model_config.dtype
+
         self._cascade_wrapper = None  # Wrapper for cascade attention
 
         # Global hyperparameters shared by all attention layers
@@ -218,7 +298,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         self.window_left = self.global_hyperparameters.window_left
         self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
         self.has_sinks = self.global_hyperparameters.has_sinks
-
+        if self.has_sinks and not supports_trtllm_attention()[0]:
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         # Preparing persistent buffers (device-side)
         self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                            dtype=torch.int32,
@@ -291,7 +375,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 paged_kv_indices_buffer=paged_kv_indices,
                 paged_kv_last_page_len_buffer=paged_kv_last_page_len,
                 # Tensor cores are enabled by default because the perf would be
-                # atleast as good as cuda cores for all attention ops in latest
+                # at least as good as cuda cores for all attention ops in latest
                 # gpus.
                 use_tensor_cores=True,
             )
@@ -317,7 +401,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\
-            split_decodes_and_prefills(common_attn_metadata)
+            split_decodes_and_prefills(common_attn_metadata,
+                                       decode_threshold=self.reorder_batch_threshold)
 
         page_size = self.page_size
         max_q_len = common_attn_metadata.max_query_len
@@ -409,7 +494,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                                                  self.q_data_type,
                                                  is_prefill=False,
                                                  has_sinks=self.has_sinks)
-
+        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
             q_data_type=self.q_data_type,
@@ -542,22 +631,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                     )
         return attn_metadata
 
-    def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata):
-        """
-        This method builds the metadata for full cudagraph capture.
-        Currently, only decode is supported for full cudagraphs with FlashInfer.
-        """
-        m = common_attn_metadata
-
-        assert m.num_reqs == m.num_actual_tokens, \
-            "FlashInfer only supports decode-only full CUDAGraph capture. " \
-            "Make sure all cudagraph capture sizes <= max_num_seq."
-
-        m.max_query_len = 1  # decode-only
-
-        return self.build(0, m)
-
     def use_cascade_attention(self, *args, **kwargs) -> bool:
         if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype:
             # TODO: The cascade wrapper currently does not support setting
@@ -667,8 +740,6 @@ class FlashInferImpl(AttentionImpl):
 
         # The attn+quant fusion happens when output_scale is provided.
         if output_scale is None:
-            assert attn_metadata.q_data_type != FP8_DTYPE, \
-                "Query can only be FP8 if output fusion happened."
             assert output_block_scale is None, "output_block_scale "\
                 "is not supported when fusion has not happened"
         else:
@@ -686,7 +757,7 @@ class FlashInferImpl(AttentionImpl):
             else:
                 raise ValueError(f"Unsupported output dtype: {output.dtype}")
 
-            # TRTLLM attn kernel requires o scale to pass as a host scalar,
+            # TRTLLM attn kernel requires to scale to pass as a host scalar,
             # store the o scale as a host scalar in warmup run with cuda graph
             # not enabled
             if layer._o_scale_float is None:
@@ -696,7 +767,8 @@ class FlashInferImpl(AttentionImpl):
                 elif output.dtype == FP4_DTYPE:
                     self.o_sf_scale = layer._o_scale_float
 
-            # Insert FP8 quant for query
+        # Insert FP8 quant for query
+        if attn_metadata.q_data_type == FP8_DTYPE:
             num_tokens, num_heads, head_size = query.shape
             query, _ = ops.scaled_fp8_quant(
                 query.reshape(
@@ -805,11 +877,29 @@ class FlashInferImpl(AttentionImpl):
                     assert self.o_sf_scale is None
                     out = output[num_decode_tokens:]
 
+                if attn_metadata.q_data_type != FP8_DTYPE \
+                    and self.kv_cache_dtype.startswith("fp8"):
+                    # TRTLLM prefill attention does not support BF16 Q
+                    # and fp8 kv cache. So to enable prefill attention
+                    # with fp8 kv cache, we can construct a mock block
+                    # and mock kv cache with BF16 KV involved in the prefill
+                    mock_kv_cache, mock_block_table = (
+                        trtllm_prefill_attn_kvfp8_dequant(
+                            kv_cache_permute,
+                            block_tables_prefill,
+                            layer._k_scale,
+                            layer._v_scale,
+                            attn_metadata.q_data_type,
+                        ))
+                else:
+                    mock_kv_cache = kv_cache_permute
+                    mock_block_table = block_tables_prefill
+
                 trtllm_batch_context_with_kv_cache(
                     query=prefill_query,
-                    kv_cache=kv_cache_permute,
+                    kv_cache=mock_kv_cache,
                     workspace_buffer=workspace_buffer,
-                    block_tables=block_tables_prefill,
+                    block_tables=mock_block_table,
                     seq_lens=seq_lens_prefill,
                     max_q_len=attn_metadata.max_q_len,
                     max_kv_len=attn_metadata.max_seq_len,
@@ -847,7 +937,7 @@ class FlashInferImpl(AttentionImpl):
                 decode_query = decode_query.contiguous()
                 workspace_buffer = decode_wrapper._float_workspace_buffer
                 block_tables_decode = attn_metadata.\
-                        block_table_tensor[:num_decode_tokens]
+                    block_table_tensor[:num_decode_tokens]
                 seq_lens_decode = attn_metadata.seq_lens[:num_decode_tokens]
 
                 # This path needs to be enabled with VLLM_KV_CACHE_LAYOUT = HND
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index d5b1c15e68d0e96d65c0bf603a1067cf57c1d12f..cb983494216a7d3575f4a90acb73e512954e707d 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -516,10 +516,11 @@ class FlexAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
-        self.device = device
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
             self.parallel_config)
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..12233af057b04360474b571ceedb6d2515ab6c98
--- /dev/null
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -0,0 +1,319 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Backend for GatedDeltaNet attention."""
+from dataclasses import dataclass
+from typing import ClassVar, Optional
+
+import torch
+
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.config import VllmConfig
+from vllm.v1.attention.backends.utils import (AttentionCGSupport,
+                                              AttentionMetadataBuilder,
+                                              CommonAttentionMetadata,
+                                              split_decodes_and_prefills)
+from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
+
+
+class GDNAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_builder_cls() -> type["GDNAttentionMetadataBuilder"]:
+        return GDNAttentionMetadataBuilder
+
+
+@dataclass
+class GDNAttentionMetadata:
+    num_prefills: int
+    num_prefill_tokens: int
+    num_decodes: int
+    num_decode_tokens: int
+    num_spec_decodes: int
+    num_spec_decode_tokens: int
+
+    has_initial_state: Optional[torch.Tensor] = None
+
+    spec_query_start_loc: Optional[
+        torch.Tensor] = None  # shape: [num_spec_decodes + 1,]
+    non_spec_query_start_loc: Optional[
+        torch.Tensor] = None  # shape: [batch - num_spec_decodes + 1,]
+
+    spec_state_indices_tensor: Optional[
+        torch.Tensor] = None  # shape: [batch, num_spec]
+    non_spec_state_indices_tensor: Optional[
+        torch.Tensor] = None  # shape: [batch - num_spec_decodes,]
+    spec_sequence_masks: Optional[torch.Tensor] = None  # shape: [batch,]
+    spec_token_masks: Optional[
+        torch.
+        Tensor] = None  # shape: [num_prefill_tokens + num_decode_tokens,]
+    num_accepted_tokens: Optional[torch.Tensor] = None  # shape: [batch,]
+
+
+class GDNAttentionMetadataBuilder(
+        AttentionMetadataBuilder[GDNAttentionMetadata]):
+
+    cudagraph_support = AttentionCGSupport.UNIFORM_BATCH
+
+    reorder_batch_threshold: ClassVar[int] = 1
+
+    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
+                 vllm_config: VllmConfig, device: torch.device):
+        assert isinstance(kv_cache_spec, MambaSpec)
+        self.vllm_config = vllm_config
+        self.compilation_config = vllm_config.compilation_config
+        self.speculative_config = vllm_config.speculative_config
+        self.kv_cache_spec = kv_cache_spec
+        if self.speculative_config:
+            self.num_spec = self.speculative_config.num_speculative_tokens  # noqa: E501
+        else:
+            self.num_spec = 0
+        self.use_spec_decode = self.num_spec > 0
+        self.reorder_batch_threshold = self.num_spec + 1  # type: ignore[misc]
+
+        self.use_full_cuda_graph = \
+            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+        self.decode_cudagraph_max_bs = min(
+            self.vllm_config.scheduler_config.max_num_seqs,
+            self.compilation_config.max_capture_size)
+
+        self.spec_state_indices_tensor = torch.empty(
+            (self.decode_cudagraph_max_bs, self.num_spec + 1),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.non_spec_state_indices_tensor = torch.empty(
+            (self.decode_cudagraph_max_bs, ),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.spec_sequence_masks = torch.empty(
+            (self.decode_cudagraph_max_bs, ),
+            dtype=torch.bool,
+            device=device,
+        )
+        self.spec_token_masks = torch.empty(
+            (self.decode_cudagraph_max_bs * (self.num_spec + 1), ),
+            dtype=torch.bool,
+            device=device,
+        )
+        self.spec_query_start_loc = torch.empty(
+            (self.decode_cudagraph_max_bs + 1, ),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.non_spec_query_start_loc = torch.empty(
+            (self.decode_cudagraph_max_bs + 1, ),
+            dtype=torch.int32,
+            device=device,
+        )
+        self.num_accepted_tokens = torch.empty(
+            (self.decode_cudagraph_max_bs, ),
+            dtype=torch.int32,
+            device=device,
+        )
+
+    def build(  # type: ignore[override]
+        self,
+        common_prefix_len: int,
+        common_attn_metadata: CommonAttentionMetadata,
+        num_accepted_tokens: Optional[torch.Tensor] = None,
+        num_draft_tokens: Optional[torch.Tensor] = None,
+        fast_build: bool = False,
+    ) -> GDNAttentionMetadata:
+        m = common_attn_metadata
+
+        query_start_loc = m.query_start_loc
+        context_lens = m.num_computed_tokens_cpu
+        context_lens_tensor = context_lens.to(query_start_loc.device)
+        seq_lens_tensor = m.seq_lens
+
+        if (not self.use_spec_decode or num_draft_tokens is None
+                or num_draft_tokens.sum().item() == 0):
+            spec_sequence_masks = None
+        else:
+            spec_sequence_masks = (num_draft_tokens > 0) & (
+                context_lens_tensor +
+                (num_draft_tokens + 1) == seq_lens_tensor)
+            if spec_sequence_masks.sum().item() == 0:
+                spec_sequence_masks = None
+
+        if spec_sequence_masks is None:
+            num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
+                split_decodes_and_prefills(m, decode_threshold=1))
+            num_spec_decodes = 0
+            num_spec_decode_tokens = 0
+            spec_token_masks = None
+            spec_state_indices_tensor = None
+            non_spec_state_indices_tensor = m.block_table_tensor[:, 0]
+            spec_query_start_loc = None
+            non_spec_query_start_loc = query_start_loc
+            num_accepted_tokens = None
+        else:
+            num_spec_decodes = spec_sequence_masks.sum().item()
+            query_lens = query_start_loc[1:] - query_start_loc[:-1]
+
+            non_spec_query_lens = query_lens[~spec_sequence_masks]
+            num_decodes = (non_spec_query_lens == 1).sum().item()
+            num_prefills = non_spec_query_lens.size(0) - num_decodes
+            num_decode_tokens = num_decodes
+            num_prefill_tokens = non_spec_query_lens.sum().item(
+            ) - num_decode_tokens
+
+            if num_prefills == 0 and num_decodes == 0:
+                spec_token_masks = torch.ones(
+                    (min(num_spec_decodes *
+                         (self.num_spec + 1), query_start_loc[-1].item())),
+                    dtype=torch.bool,
+                    device=query_start_loc.device)
+                spec_state_indices_tensor = m.block_table_tensor[:, :self.
+                                                                 num_spec + 1]
+                non_spec_state_indices_tensor = None
+                spec_query_start_loc = query_start_loc
+                non_spec_query_start_loc = None
+            else:
+                spec_token_masks = torch.repeat_interleave(
+                    spec_sequence_masks, query_lens)
+                spec_state_indices_tensor = m.block_table_tensor[
+                    spec_sequence_masks, :self.num_spec + 1]
+                non_spec_state_indices_tensor = \
+                    m.block_table_tensor[~spec_sequence_masks, 0]
+
+                spec_query_start_loc = torch.zeros(
+                    num_spec_decodes + 1,
+                    dtype=torch.int32,
+                    device=query_start_loc.device)
+                torch.cumsum(query_lens[spec_sequence_masks],
+                             dim=0,
+                             out=spec_query_start_loc[1:])
+                non_spec_query_start_loc = torch.zeros(
+                    query_lens.size(0) - num_spec_decodes + 1,
+                    dtype=torch.int32,
+                    device=query_start_loc.device)
+                torch.cumsum(query_lens[~spec_sequence_masks],
+                             dim=0,
+                             out=non_spec_query_start_loc[1:])
+
+            num_spec_decode_tokens = min(
+                num_spec_decodes * (self.num_spec + 1),
+                spec_token_masks.size(0))
+            assert num_accepted_tokens is not None
+            num_accepted_tokens = num_accepted_tokens[spec_sequence_masks]
+
+        if num_prefills > 0:
+            has_initial_state = context_lens_tensor > 0
+            if spec_sequence_masks is not None:
+                has_initial_state = has_initial_state[~spec_sequence_masks]
+        else:
+            has_initial_state = None
+
+        # prepare tensors for cudagraph
+        if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
+                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+            num_total_tokens = self.vllm_config.pad_for_cudagraph(
+                m.num_actual_tokens)
+            batch_size = num_total_tokens // (self.num_spec + 1)
+
+            self.spec_state_indices_tensor[:num_spec_decodes].copy_(
+                spec_state_indices_tensor, non_blocking=True)
+            spec_state_indices_tensor = self.spec_state_indices_tensor[:
+                                                                       batch_size]
+            spec_state_indices_tensor[num_spec_decodes:].fill_(PAD_SLOT_ID)
+
+            self.spec_sequence_masks[:num_spec_decodes].copy_(
+                spec_sequence_masks, non_blocking=True)
+            spec_sequence_masks = self.spec_sequence_masks[:batch_size]
+            spec_sequence_masks[num_spec_decodes:].fill_(False)
+
+            assert spec_token_masks is not None
+            self.spec_token_masks[:spec_token_masks.size(0)].copy_(
+                spec_token_masks, non_blocking=True)
+            spec_token_masks = self.spec_token_masks[:m.num_actual_tokens]
+            spec_token_masks[spec_token_masks.size(0):].fill_(False)
+
+            self.spec_query_start_loc[:num_spec_decodes + 1].copy_(
+                spec_query_start_loc, non_blocking=True)
+            spec_num_query_tokens = spec_query_start_loc[
+                -1]  # type: ignore[index]
+            spec_query_start_loc = self.spec_query_start_loc[:batch_size + 1]
+            spec_query_start_loc[num_spec_decodes +
+                                 1:].fill_(spec_num_query_tokens)
+
+            self.num_accepted_tokens[:num_spec_decodes].copy_(
+                num_accepted_tokens, non_blocking=True)
+            num_accepted_tokens = self.num_accepted_tokens[:batch_size]
+            num_accepted_tokens[num_spec_decodes:].fill_(1)
+
+        if (self.use_full_cuda_graph and num_prefills == 0
+                and num_spec_decodes == 0
+                and num_decodes <= self.decode_cudagraph_max_bs):
+            num_total_tokens = self.vllm_config.pad_for_cudagraph(
+                m.num_actual_tokens)
+            batch_size = num_total_tokens
+
+            self.non_spec_state_indices_tensor[:num_decodes].copy_(
+                non_spec_state_indices_tensor, non_blocking=True)
+            non_spec_state_indices_tensor = \
+                self.non_spec_state_indices_tensor[:batch_size]
+            non_spec_state_indices_tensor[num_decodes:].fill_(PAD_SLOT_ID)
+
+            self.non_spec_query_start_loc[:num_decodes + 1].copy_(
+                non_spec_query_start_loc, non_blocking=True)
+            non_spec_num_query_tokens = non_spec_query_start_loc[
+                -1]  # type: ignore[index]
+            non_spec_query_start_loc = \
+                self.non_spec_query_start_loc[:batch_size + 1]
+            non_spec_query_start_loc[num_decodes +
+                                     1:].fill_(non_spec_num_query_tokens)
+
+        attn_metadata = GDNAttentionMetadata(
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decodes=num_decodes,
+            num_decode_tokens=num_decode_tokens,
+            num_spec_decodes=num_spec_decodes,
+            num_spec_decode_tokens=num_spec_decode_tokens,
+            has_initial_state=has_initial_state,
+            spec_query_start_loc=spec_query_start_loc,
+            non_spec_query_start_loc=non_spec_query_start_loc,
+            spec_state_indices_tensor=spec_state_indices_tensor,
+            non_spec_state_indices_tensor=non_spec_state_indices_tensor,
+            spec_sequence_masks=spec_sequence_masks,
+            spec_token_masks=spec_token_masks,
+            num_accepted_tokens=num_accepted_tokens,
+        )
+        return attn_metadata
+
+    def build_for_cudagraph_capture(
+            self, common_attn_metadata: CommonAttentionMetadata):
+        """
+        This method builds the metadata for full cudagraph capture.
+        Currently, only decode is supported for full cudagraphs with Mamba.
+        """
+        m = common_attn_metadata
+
+        assert (m.num_reqs * (self.num_spec + 1) <= m.num_actual_tokens
+                and ((m.num_reqs + 1) * (self.num_spec + 1)
+                     >= m.num_actual_tokens)), \
+            "GDN only supports decode-only full CUDAGraph capture. " \
+            "Make sure all cudagraph capture sizes <= max_num_seq."
+
+        num_accepted_tokens = torch.full((m.num_reqs, ),
+                                         m.max_query_len,
+                                         dtype=torch.int32,
+                                         device=m.query_start_loc.device)
+        num_drafted_tokens = torch.full((m.num_reqs, ),
+                                        self.num_spec,
+                                        dtype=torch.int32,
+                                        device=m.query_start_loc.device)
+
+        # Fixes query-start loc for spec-sequence-indices.
+        m.query_start_loc = torch.arange(0,
+                                         m.num_actual_tokens + 1,
+                                         step=m.max_query_len,
+                                         device=m.query_start_loc.device,
+                                         dtype=torch.int32)
+        m.num_computed_tokens_cpu = (m.seq_lens_cpu - torch.full(
+            (m.num_reqs, ), m.max_query_len, dtype=torch.int32, device='cpu'))
+
+        return self.build(0, m, num_accepted_tokens, num_drafted_tokens)
diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py
index f08b6d7f177c7b2c69c21b8b90df833762fe7c70..3ff201d83a79baf199c99e1a8c1cd2aa96be6eac 100644
--- a/vllm/v1/attention/backends/linear_attn.py
+++ b/vllm/v1/attention/backends/linear_attn.py
@@ -39,8 +39,8 @@ class LinearAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         assert isinstance(kv_cache_spec, MambaSpec)
-        self.kv_cache_spec = kv_cache_spec
 
     def build(self,
               common_prefix_len: int,
@@ -52,8 +52,9 @@ class LinearAttentionMetadataBuilder(
         state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=1))
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
 
         attn_metadata = LinearAttentionMetadata(
             num_prefills=num_prefills,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 97a1aa86dda0ded755ae11ce2fbd06b0e3dad03a..7cbfa2c2c9a5460e1512681ba6863061addaae6f 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -50,8 +50,9 @@ class Mamba1AttentionMetadataBuilder(
             query_start_loc.device)
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=1))
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
 
         has_initial_states = None
         padded_decodes = num_decodes
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index ed30884fdbc94f16f74ee2535458927e82ef1048..359bad1ea9dee3039378a0f953536e5ac9df1f1b 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -16,9 +16,58 @@ from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 
-def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor,
-                                              chunk_size: int,
-                                              total_seqlens: int):
+def _query_start_loc_to_chunk_indices_offsets(
+        query_start_loc: torch.Tensor, chunk_size: int,
+        total_seqlens: int) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args:
+        query_start_loc (torch.Tensor): 1D tensor of cumulative sequence 
+            lengths, shape (num_seqs + 1,).
+            The first element should be 0. Each entry represents the starting
+            index of a sequence in the flattened token array.
+        chunk_size (int): The size of each physical mamba chunk
+            (number of tokens per chunk).
+        total_seqlens (int): The total number of tokens in the batch.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - chunk_indices (torch.Tensor): 1D tensor of indices 
+                indicating the physical chunk for each logical chunk.
+            - chunk_offsets (torch.Tensor): 1D tensor of offsets
+                indicating the starting index of each logical chunk within
+                its physical chunk.
+
+    This function computes the chunk indices and offsets for the given
+    query_start_loc and chunk_size. Both are tensors of integers with length N,
+    where N is the number of logical (pseudo) chunks.
+    A logical chunk is a sequence of tokens that are all part of the same
+    sequence and are all in the same physical mamba chunk.
+    In other words, a logical chunk changes every time we cross a sequence
+    boundary or a physical mamba chunk boundary.
+    Logical chunks are needed to handle batched requests with initial states
+    (see _state_passing_fwd and _chunk_scan_fwd).
+    The chunk_indices tensor contains the index of the physical chunk for each
+    logical chunk.
+    The chunk_offsets tensor contains the offset (AKA starting index) of the
+    logical chunk in the physical chunk.
+
+    Example:
+    query_start_loc = [0, 5, 10]
+    chunk_size = 8
+    total_seqlens = 10
+    -> chunk_indices = [0, 0, 1]
+    -> chunk_offsets = [0, 5, 0]
+
+    In this example, we have 2 sequences, each with 5 tokens. The physical
+    chunk size is 8 tokens.
+    We have three logical chunks:
+    - the first logical chunk starts at token 0 in the first physical chunk
+        and contains all 5 tokens from the first sequence
+    - the second logical chunk starts at token 5 in the first physical chunk
+        and contains first 3 tokens from the second sequence
+    - the third logical chunk starts at token 0 in the second physical chunk
+        and contains the remaining 2 tokens from the second sequence
+    """
 
     cu_seqlens = query_start_loc[1:]  # remove prepended 0
 
@@ -115,8 +164,9 @@ class Mamba2AttentionMetadataBuilder(
         state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=1))
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
 
         # Compute seq_idx, chunk_indices and chunk_offsets for prefill only
         if num_prefills > 0:
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index 07ef7cb69a160cd5b49c172416569bb1da4e9062..9970331a6042c389d09f97f636c3d560053adf40 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -22,12 +22,9 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        assert isinstance(kv_cache_spec, MambaSpec)
-        self.kv_cache_spec = kv_cache_spec
-        self.device = device
-        self.vllm_config = vllm_config
-        self.layer_names = layer_names
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
 
+        assert isinstance(kv_cache_spec, MambaSpec)
         self.compilation_config = vllm_config.compilation_config
         self.decode_cudagraph_max_bs = min(
             self.vllm_config.scheduler_config.max_num_seqs,
@@ -52,4 +49,4 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
 
         m.max_query_len = 1  # decode-only
 
-        return self.build(0, m)
\ No newline at end of file
+        return self.build(0, m)
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 9f93b50b075b41aa5ca78e7418dc06b099d9c46b..036a281f1d26e299a159f95af4200c593d74fe03 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -201,10 +201,11 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
 from vllm.attention.backends.utils import get_mla_dims
+from vllm.attention.ops.common import cp_lse_ag_out_rs
 from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import is_global_first_rank
+from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase,
@@ -323,6 +324,13 @@ class MLACommonPrefillMetadata:
         seq_lens: torch.Tensor
         workspace: torch.Tensor
 
+        # for mla DCP
+        cp_chunk_seq_lens: Optional[list[list[int]]] = None
+        origin_context_lens: Optional[list[int]] = None
+        cp_cu_seq_lens: Optional[torch.Tensor] = None
+        chunk_size: Optional[int] = None
+        cu_seq_lens_lst: Optional[list[list[int]]] = None
+
     block_table: torch.Tensor
     query_start_loc: torch.Tensor
     max_query_len: int
@@ -373,6 +381,7 @@ class MLACommonMetadata(Generic[D]):
 
     num_reqs: int
     max_query_len: int
+    max_seq_len: int
 
     num_actual_tokens: int  # Number of tokens excluding padding.
     query_start_loc: torch.Tensor
@@ -401,7 +410,7 @@ M = TypeVar("M", bound=MLACommonMetadata)
 
 
 def use_flashinfer_prefill() -> bool:
-    # For blackwell default to flashinfer prefill if its available since
+    # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
     return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
             and current_platform.is_device_capability(100))
@@ -435,17 +444,26 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         self.metadata_cls = metadata_cls \
             if metadata_cls is not None else MLACommonMetadata
         self.kv_cache_spec = kv_cache_spec
-        self.device = device
         scheduler_config = vllm_config.scheduler_config
         self.model_config = vllm_config.model_config
-        cache_config = vllm_config.cache_config
         parallel_config = vllm_config.parallel_config
+        cache_config = vllm_config.cache_config
+        self.compilation_config = vllm_config.compilation_config
+        self.device = device
+
         self.num_heads = self.model_config.get_num_attention_heads(
             parallel_config)
         self.mla_dims = get_mla_dims(self.model_config)
         self.aot_schedule = current_platform.is_cuda()
-
-        # Dont try to access the runner on AMD
+        try:
+            self.dcp_world_size = get_dcp_group().world_size
+            self.dcp_rank = get_dcp_group().rank_in_group
+        except AssertionError:
+            # DCP might not be initialized in testing
+            self.dcp_world_size = 1
+            self.dcp_rank = 0
+
+        # Don't try to access the runner on AMD
         if self.aot_schedule:
             self.page_size = self.kv_cache_spec.block_size
 
@@ -465,12 +483,27 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             128 * 1024)
         assert self.chunked_prefill_workspace_size >= \
             scheduler_config.max_num_seqs * cache_config.block_size
-        self.chunked_prefill_workspace = torch.empty(
-            (self.chunked_prefill_workspace_size,
-             self.model_config.get_head_size()),
-            dtype=self.model_config.dtype,
-            device=device,
-        )
+        if self.dcp_world_size > 1:
+            # Note(hc): The local kvcache is incomplete when DCP is triggered,
+            # an additional kvcache allgather across the DCP group is therefore
+            # required, so the workspace has to be enlarged by 1/DCP relative
+            # to the original TP allocation.
+            assert self.chunked_prefill_workspace_size % \
+                self.dcp_world_size == 0
+            self.chunked_prefill_workspace = torch.empty(
+                (self.chunked_prefill_workspace_size +
+                 self.chunked_prefill_workspace_size // self.dcp_world_size,
+                 self.model_config.get_head_size()),
+                dtype=self.model_config.dtype,
+                device=device,
+            )
+        else:
+            self.chunked_prefill_workspace = torch.empty(
+                (self.chunked_prefill_workspace_size,
+                 self.model_config.get_head_size()),
+                dtype=self.model_config.dtype,
+                device=device,
+            )
 
         self._use_cudnn_prefill = use_cudnn_prefill()
         self._use_fi_prefill = use_flashinfer_prefill()
@@ -579,10 +612,14 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         prefill.prefill_chunks = self._fi_prefill_chunks
 
     def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens: torch.Tensor):
+                      seq_lens_cpu: torch.Tensor,
+                      seq_lens_device: torch.Tensor,
+                      query_start_loc_cpu: torch.Tensor,
+                      query_start_loc_device: torch.Tensor,
+                      num_decode_tokens: int) -> MLACommonDecodeMetadata:
         return MLACommonDecodeMetadata(
             block_table=block_table_tensor,
-            seq_lens=seq_lens,
+            seq_lens=seq_lens_device,
         )
 
     def build_for_cudagraph_capture(
@@ -592,11 +629,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         Currently, only decode is supported for full cudagraphs with MLA.
         """
         m = common_attn_metadata
-        assert m.num_reqs == m.num_actual_tokens, \
+        assert m.num_reqs <= (m.num_actual_tokens *
+                              self.reorder_batch_threshold), \
             "MLA only supports decode-only full CUDAGraph capture. " \
             "Make sure all cudagraph capture sizes <= max_num_seq."
 
-        assert m.max_query_len == 1  # decode-only
+        assert m.max_query_len <= self.reorder_batch_threshold  # decode only
 
         return self.build(0, m)
 
@@ -607,6 +645,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         num_reqs = common_attn_metadata.num_reqs
         num_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
+        max_seq_len = common_attn_metadata.max_seq_len
 
         # Note(simon): be careful about the CPU <> GPU memory movement in this
         # function. We should avoid GPU -> CPU sync as much as possible because
@@ -618,6 +657,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         query_start_loc = common_attn_metadata.query_start_loc
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
         seq_lens = common_attn_metadata.seq_lens
+        seq_lens_cpu = common_attn_metadata.seq_lens_cpu
 
         query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
 
@@ -625,7 +665,14 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                                    query_seq_lens_cpu)
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
-            split_decodes_and_prefills(common_attn_metadata)
+            split_decodes_and_prefills(common_attn_metadata,
+                                       decode_threshold=self.reorder_batch_threshold)
+
+        # Note(hc): update seq_lens of decode reqs under DCP.
+        if self.dcp_world_size > 1:
+            seq_lens[:num_decodes] = seq_lens[:num_decodes] \
+                // self.dcp_world_size + (self.dcp_rank <= \
+                (seq_lens[:num_decodes] - 1) % self.dcp_world_size)
 
         assert num_decodes + num_prefills == num_reqs
         assert num_decode_tokens + num_prefill_tokens == num_tokens
@@ -635,6 +682,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             reqs_start = num_decodes  # prefill_start
 
             context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
+            # Note(hc): The context lengths in the perspective of dcp rank0.
+            cp_context_lens_cpu = torch.ceil(context_lens_cpu.float() /
+                                             self.dcp_world_size).int()
+            origin_context_lens = context_lens_cpu.tolist()
             max_context_len_cpu = context_lens_cpu.max().item()
             num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
             prefill_query_start_loc = query_start_loc[
@@ -687,20 +738,66 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                              out=cu_seq_lens_cpu[:, 1:],
                              dtype=torch.int32)
 
+                if self.dcp_world_size > 1:
+                    # Note(hc): The above max_context_chunk already enforces
+                    # block_size alignment, DCP just need the block_size can
+                    # be divisible by dcp_world_size, because DCP use
+                    # cp_gather_cache which not require `cp_chunk_starts`
+                    # aligned to page_size.
+                    assert max_context_chunk % self.dcp_world_size == 0
+                    cp_max_context_chunk = max_context_chunk // \
+                        self.dcp_world_size
+                    cp_chunk_starts = \
+                        torch.arange(num_chunks, dtype=torch.int32) \
+                        .unsqueeze(1).expand(-1, num_prefills) \
+                        * cp_max_context_chunk
+                    cp_chunk_ends = torch.min(
+                        cp_context_lens_cpu.unsqueeze(0),
+                        cp_chunk_starts + cp_max_context_chunk)
+                    cp_chunk_seq_lens = (cp_chunk_ends -
+                                         cp_chunk_starts).clamp(min=0)
+
+                    cp_cu_seq_lens_cpu = torch.zeros(num_chunks,
+                                                     num_prefills + 1,
+                                                     dtype=torch.int32,
+                                                     pin_memory=True)
+                    torch.cumsum(cp_chunk_seq_lens,
+                                 dim=1,
+                                 out=cp_cu_seq_lens_cpu[:, 1:],
+                                 dtype=torch.int32)
+
                 chunked_context_metadata_cls = \
                     CudnnPrefillMetadata.ChunkedContextMetadata \
                     if self._use_cudnn_prefill else \
                         MLACommonPrefillMetadata.ChunkedContextMetadata
-
-                chunked_context_metadata = \
-                    chunked_context_metadata_cls(
-                    cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True),
-                    starts=chunk_starts.to(device, non_blocking=True),
-                    seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
-                    max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
-                    seq_lens=chunk_seq_lens,
-                    workspace=self.chunked_prefill_workspace,
-                )
+                if self.dcp_world_size > 1:
+                    chunked_context_metadata = \
+                        chunked_context_metadata_cls(
+                        cu_seq_lens=cu_seq_lens_cpu \
+                            .to(device, non_blocking=True),
+                        starts=cp_chunk_starts.to(device, non_blocking=True),
+                        seq_tot=cp_chunk_seq_lens.sum(dim=1).tolist(),
+                        max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
+                        seq_lens=chunk_seq_lens,
+                        workspace=self.chunked_prefill_workspace,
+                        cp_chunk_seq_lens=cp_chunk_seq_lens.tolist(),
+                        origin_context_lens=origin_context_lens,
+                        cp_cu_seq_lens=cp_cu_seq_lens_cpu \
+                            .to(device, non_blocking=True),
+                        chunk_size=max_context_chunk,
+                        cu_seq_lens_lst=cu_seq_lens_cpu.tolist(),
+                    )
+                else:
+                    chunked_context_metadata = \
+                        chunked_context_metadata_cls(
+                        cu_seq_lens=cu_seq_lens_cpu \
+                            .to(device, non_blocking=True),
+                        starts=chunk_starts.to(device, non_blocking=True),
+                        seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
+                        max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
+                        seq_lens=chunk_seq_lens,
+                        workspace=self.chunked_prefill_workspace,
+                    )
 
                 if self._use_cudnn_prefill:
                     chunked_context_metadata.seq_lens = chunk_seq_lens
@@ -725,12 +822,17 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         if num_decodes > 0:
             decode_metadata = self._build_decode(
                 block_table_tensor=block_table_tensor[:num_decodes, ...],
-                seq_lens=seq_lens[:num_decodes],
+                seq_lens_cpu=seq_lens_cpu[:num_decodes],
+                seq_lens_device=seq_lens[:num_decodes],
+                query_start_loc_cpu=query_start_loc_cpu[:num_decodes + 1],
+                query_start_loc_device=query_start_loc[:num_decodes + 1],
+                num_decode_tokens=num_decode_tokens,
             )
 
         attn_metadata = self.metadata_cls(
             num_reqs=common_attn_metadata.num_reqs,
             max_query_len=common_attn_metadata.max_query_len,
+            max_seq_len=max_seq_len,
             num_actual_tokens=num_tokens,
             query_start_loc=query_start_loc,
             slot_mapping=slot_mapping,
@@ -750,6 +852,71 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         return attn_metadata
 
 
+def reorg_kvcache(
+    allgatered_kv_c_normed: torch.Tensor,
+    allgatered_k_pe: torch.Tensor,
+    cp_chunk_seq_lens_lst: list[int],
+    origin_context_lens: list[int],
+    cp_world_size: int,
+    sum_seq_len: int,
+    max_seq_len: int,
+    chunk_size: int,
+    chunk_idx: int,
+    toks: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    reorg kvcache after cp local gather to tp layout for attn kernel.
+
+    Args:
+        cp_chunk_seq_lens_lst: chunk context lengths under CP.
+        origin_context_lens: origin full context lengths under CP.
+        cp_world_size: CP size.
+        sum_seq_len: the sum of cp_chunk_seq_lens_lst.
+        max_seq_len: the max value of cp_chunk_seq_lens_lst.
+        chunk_size: equals to max_context_chunk from
+            chunked_context_metadata building.
+        chunk_idx: chunk idx of chunked_prefill.
+        toks: the number of tokens for local gather cache.
+    """
+    kv_c_segments = []
+    k_pe_segments = []
+    src_token_idx = 0
+    max_seq_len_check = 0
+    for cp_chunk_seq_len, origin_context_len in zip(cp_chunk_seq_lens_lst,
+                                                    origin_context_lens):
+        chunk_context_len = chunk_size
+        if cp_chunk_seq_len != 0:
+            chunk_context_len = min(
+                chunk_context_len, origin_context_len - chunk_size * chunk_idx)
+        cp_target_rank = (chunk_context_len - 1) % cp_world_size
+        cur_seq_len = 0
+        for rank in range(cp_world_size):
+            if rank > cp_target_rank and cp_chunk_seq_len:
+                real_cp_chunk_seq_len = cp_chunk_seq_len - 1
+            else:
+                real_cp_chunk_seq_len = cp_chunk_seq_len
+            if real_cp_chunk_seq_len:
+                kv_c_segment = allgatered_kv_c_normed[rank * toks +
+                                                      src_token_idx:rank *
+                                                      toks + src_token_idx +
+                                                      real_cp_chunk_seq_len]
+                k_pe_segment = allgatered_k_pe[rank * toks +
+                                               src_token_idx:rank * toks +
+                                               src_token_idx +
+                                               real_cp_chunk_seq_len]
+                kv_c_segments.append(kv_c_segment)
+                k_pe_segments.append(k_pe_segment)
+                cur_seq_len += real_cp_chunk_seq_len
+        max_seq_len_check = max(max_seq_len_check, cur_seq_len)
+        src_token_idx += cp_chunk_seq_len
+    reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
+    reorganized_k_pe = torch.cat(k_pe_segments, dim=0)
+    assert reorganized_kv_c_normed.shape[0] == sum_seq_len
+    assert reorganized_k_pe.shape[0] == sum_seq_len
+    assert max_seq_len_check == max_seq_len
+    return reorganized_kv_c_normed, reorganized_k_pe
+
+
 class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
     """
     NOTE: Please read the comment at the top of the file before trying to
@@ -829,6 +996,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
                 self.vllm_flash_attn_version == 3
                 and current_platform.get_device_capability()[0] == 9)
 
+        self.dcp_world_size: Optional[int] = None
+
     def _flash_attn_varlen_diff_headdims(self,
                                          q,
                                          k,
@@ -1011,7 +1180,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
             return layer.weight
 
         # we currently do not have quantized bmm's which are needed for
-        # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
         # the bmm's in 16-bit, the extra memory overhead of this is fairly low
         kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
         assert kv_b_proj_weight.shape == (
@@ -1145,6 +1314,108 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
 
         return output, output_lse
 
+    def _context_parallel_compute_prefill_context(
+        self,
+        q: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+        k_scale: torch.Tensor,
+        dcp_world_size: int,
+    ):
+        assert k_scale is None, "DCP not support scaled kvcache now."
+        assert attn_metadata.prefill is not None
+        prefill_metadata = attn_metadata.prefill
+        assert prefill_metadata.chunked_context is not None
+        assert prefill_metadata.chunked_context.cp_chunk_seq_lens is not None
+        assert prefill_metadata.chunked_context.origin_context_lens is not None
+        assert prefill_metadata.chunked_context.cp_cu_seq_lens is not None
+        assert prefill_metadata.chunked_context.chunk_size is not None
+        assert prefill_metadata.chunked_context.cu_seq_lens_lst is not None
+
+        output = None
+        iters = len(prefill_metadata.chunked_context.seq_tot)
+        workspace = prefill_metadata.chunked_context.workspace
+
+        for i in range(iters):
+            toks = prefill_metadata.chunked_context.seq_tot[i]
+            ops.cp_gather_cache(
+                src_cache=kv_c_and_k_pe_cache,
+                dst=workspace,
+                block_table=prefill_metadata.block_table,
+                cu_seq_lens=prefill_metadata.chunked_context.cp_cu_seq_lens[i],
+                batch_size=attn_metadata.num_prefills,
+                seq_starts=prefill_metadata.chunked_context.starts[i],
+            )
+            # workspace
+            # |------- N tokens --------|--------- N*dcp_size tokens ----------|
+            # |<- use for loca_gather ->|<--------- use for allgather -------->|
+            allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
+            assert allgather_offset * (dcp_world_size +
+                                       1) == workspace.shape[0]
+            assert toks <= allgather_offset
+            local_gathered_kvcache = workspace[:toks]
+            cur_allgather_workspace = workspace[
+                allgather_offset:allgather_offset * (1 + dcp_world_size)]
+            assert toks * dcp_world_size <= cur_allgather_workspace.shape[0]
+            cur_allgather_kvcache = cur_allgather_workspace[:toks *
+                                                            dcp_world_size]
+            cur_allgather_kvcache.copy_(get_dcp_group().all_gather(
+                local_gathered_kvcache, dim=0))
+            assert cur_allgather_kvcache.shape[
+                -1] == self.kv_lora_rank + self.qk_rope_head_dim
+            allgatered_kv_c_normed, allgatered_k_pe = \
+                cur_allgather_kvcache.unsqueeze(
+                1).split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+            kv_c_normed, k_pe = reorg_kvcache(
+                allgatered_kv_c_normed,
+                allgatered_k_pe,
+                cp_chunk_seq_lens_lst=prefill_metadata.chunked_context.
+                cp_chunk_seq_lens[i],
+                origin_context_lens=prefill_metadata.chunked_context.
+                origin_context_lens,
+                cp_world_size=dcp_world_size,
+                sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i]
+                [-1],
+                max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
+                chunk_size=prefill_metadata.chunked_context.chunk_size,
+                chunk_idx=i,
+                toks=toks)
+
+            kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \
+                -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            k_nope, v = kv_nope\
+                .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+            k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))),
+                          dim=-1)
+
+            attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
+                prefill=prefill_metadata,
+                chunk_idx=i,
+                q=q,
+                k=k,
+                v=v,
+            )
+
+            if output is None:
+                output = attn_output
+                output_lse = attn_softmax_lse
+            else:
+                output_tmp = torch.empty_like(output)
+                output_lse_tmp = torch.empty_like(output_lse)
+                merge_attn_states(
+                    output=output_tmp,
+                    output_lse=output_lse_tmp,
+                    prefix_output=output,
+                    prefix_lse=output_lse,
+                    suffix_output=attn_output,
+                    suffix_lse=attn_softmax_lse,
+                )
+                output = output_tmp
+                output_lse = output_lse_tmp
+
+        return output, output_lse
+
     def _forward_prefill(
         self,
         q: torch.Tensor,
@@ -1155,6 +1426,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
         k_scale: torch.Tensor,
     ) -> torch.Tensor:
         assert attn_metadata.prefill is not None
+        assert self.dcp_world_size is not None
 
         has_context = attn_metadata.prefill.chunked_context is not None
         kv_nope = self.kv_b_proj(kv_c_normed)[0].view(\
@@ -1174,8 +1446,15 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
 
         if has_context:
             suffix_output, suffix_lse = output
-            context_output, context_lse = self._compute_prefill_context( \
-                q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
+            if self.dcp_world_size > 1:
+                context_output, context_lse = \
+                    self._context_parallel_compute_prefill_context(
+                    q, kv_c_and_k_pe_cache, attn_metadata,
+                    k_scale=None, dcp_world_size=self.dcp_world_size)
+            else:
+                context_output, context_lse = \
+                    self._compute_prefill_context(
+                    q, kv_c_and_k_pe_cache, attn_metadata, k_scale)
 
             output = torch.empty_like(suffix_output)
             merge_attn_states(
@@ -1195,12 +1474,11 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
     @abstractmethod
     def _forward_decode(
         self,
-        ql_nope: torch.Tensor,
-        q_pe: torch.Tensor,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: M,
         layer: AttentionLayer,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         raise NotImplementedError
 
     def forward(
@@ -1228,6 +1506,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
             # same expert outputs.
             return output.fill_(0)
 
+        if self.dcp_world_size is None:
+            self.dcp_world_size = get_dcp_group().world_size
+
         fp8_attention = self.kv_cache_dtype.startswith("fp8")
 
         num_actual_toks = attn_metadata.num_actual_tokens
@@ -1306,7 +1587,22 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
                     layer._q_scale)
                 decode_q_pe = decode_q_pe.reshape(q_pe_shape)
 
-            output[:num_decode_tokens] = self._forward_decode(
-                decode_ql_nope, decode_q_pe, kv_cache, attn_metadata, layer)
+            decode_q = (decode_ql_nope, decode_q_pe)
+            if self.dcp_world_size > 1:
+                assert not fp8_attention, "DCP not support fp8 kvcache now."
+                # concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P)
+                decode_q = torch.cat(decode_q, dim=-1)
+                # decode_q do allgather in head dim.
+                decode_q = get_dcp_group().all_gather(decode_q, dim=1)
+
+            # call decode attn
+            attn_out, lse = self._forward_decode(decode_q, kv_cache,
+                                                 attn_metadata, layer)
+
+            # recorect dcp attn_out with lse.
+            if self.dcp_world_size > 1:
+                attn_out = cp_lse_ag_out_rs(attn_out, lse, get_dcp_group())
 
+            # v_up projection
+            output[:num_decode_tokens] = self._v_up_proj(attn_out)
         return output_padded
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 8a17d3a49278329ce26342de8e9b3b57dfc63a67..6017445402eca5e30486daac222d1c923d3da8f5 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -76,6 +76,7 @@ g_sm100_workspace = SM100Workspace(128 * 1024 * 1024)  # 128MB
 
 
 class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
+    can_return_lse_for_decode: bool = True
 
     def __init__(
             self,
@@ -108,10 +109,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
                                       "are not implemented for "
                                       "CutlassMLAImpl")
 
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "CutlassMLA V1 with FP8 KV cache not yet supported")
-
         self._use_old_cutlass_mla = False
         force_old_cutlass = os.environ.get("FORCE_OLD_CUTLASS_MLA", None)
         if force_old_cutlass:
@@ -142,7 +139,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         workspace: torch.Tensor,
         sm_scale: float,
         num_kv_splits: int,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         assert (q_nope.ndim == 3
                 ), f"q_nope must be a 3D tensor, but got {q_nope.ndim}"
         assert (
@@ -182,11 +179,10 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
                 > 0), f"block num must be greater than 0, got {block_num}"
         assert block_num % (128 / PAGE_SIZE) == 0
 
-        # TODO(kaixih@nvidia): support fp8
         assert q_nope.dtype in (
-            torch.float16,
-            torch.bfloat16,
-        ), f"q_nope.dtype needs to be fp16 or bf16 but got {q_nope.dtype}."
+            torch.float16, torch.bfloat16, torch.float8_e4m3fn), (
+                f"q_nope.dtype needs to be fp16 or bf16 or e4m3 but got "
+                f"{q_nope.dtype}.")
         assert q_nope.dtype == q_pe.dtype == kv_c_and_k_pe_cache.dtype
         assert (
             seq_lens.dtype == torch.int32
@@ -195,10 +191,16 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             page_table.dtype == torch.int32
         ), f"page_table.dtype needs to be int32 but got {page_table.dtype}."
 
-        out = q_nope.new_empty((B_q, MAX_HEADS, D_latent))
+        dtype = (torch.bfloat16 if is_quantized_kv_cache(self.kv_cache_dtype)
+                 else q_nope.dtype)
+        out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype)
+        lse = (torch.empty(
+            (B_q, MAX_HEADS), dtype=torch.float32, device=q_nope.device)
+               if self.need_to_return_lse_for_decode else torch.Tensor())
 
         ops.sm100_cutlass_mla_decode(
             out,
+            lse,
             q_nope,
             q_pe,
             kv_c_and_k_pe_cache,
@@ -208,7 +210,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             sm_scale,
             num_kv_splits,
         )
-        return out[:, :H].contiguous()
+        returned_lse = lse[:, :H].contiguous(
+        ) if self.need_to_return_lse_for_decode else lse
+        return out[:, :H].contiguous(), returned_lse
 
     def _sm100_forward_decode(
         self,
@@ -216,13 +220,10 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         q_pe: torch.Tensor,
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
-        if self.kv_cache_dtype.startswith("fp8"):
-            raise NotImplementedError("FP8 Cutlass MLA not yet supported")
-
         # Adjust workspace size (if necessary)
         self._workspace.ensure_size(attn_metadata, self._num_kv_splits)
 
@@ -232,13 +233,18 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         q_nope = q_nope.clone()
         q_pe = q_pe.clone()
 
-        o = self._sm100_cutlass_mla_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
-                                           attn_metadata.decode.seq_lens,
-                                           attn_metadata.decode.block_table,
-                                           self._workspace.get_buf(),
-                                           self.scale, self._num_kv_splits)
+        o, lse = self._sm100_cutlass_mla_decode(
+            q_nope,
+            q_pe,
+            kv_c_and_k_pe_cache,
+            attn_metadata.decode.seq_lens,
+            attn_metadata.decode.block_table,
+            self._workspace.get_buf(),
+            self.scale,
+            self._num_kv_splits,
+        )
 
-        return self._v_up_proj(o)
+        return o, (lse if self.need_to_return_lse_for_decode else None)
 
     # TODO: Currently we leave it here only for backup in case something is
     #       wrong with the new SM100 CUTLASS MLA kernel
@@ -252,8 +258,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
-        if self.kv_cache_dtype.startswith("fp8"):
-            raise NotImplementedError("FP8 Cutlass MLA not yet supported")
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FP8 Cutlass MLA not supported with FORCE_OLD_CUTLASS_MLA")
 
         B = q_nope.shape[0]
 
@@ -270,21 +277,25 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
                                attn_metadata.decode.seq_lens,
                                attn_metadata.decode.block_table, self.scale)
 
-        return self._v_up_proj(o)
+        return o
 
     def _forward_decode(
         self,
-        q_nope: torch.Tensor,
-        q_pe: torch.Tensor,
+        q: torch.Tensor,
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        if type(q) is tuple:
+            q_nope, q_pe = q
+        else:
+            q_nope, q_pe = torch.split(
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
         if self._use_old_cutlass_mla:
             # TODO: Remove the old cutlass MLA kernel after more extensive
             #       testing
             return self._old_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
-                                            attn_metadata)
+                                            attn_metadata), None
 
         return self._sm100_forward_decode(q_nope, q_pe, kv_c_and_k_pe_cache,
                                           attn_metadata)
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
new file mode 100644
index 0000000000000000000000000000000000000000..472095e13615b0346b424a1635e65e659afa5176
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -0,0 +1,273 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from typing import ClassVar, Optional, Union
+
+import torch
+
+from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.attention.utils.fa_utils import (flash_attn_supports_mla,
+                                           get_flash_attn_version)
+from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_dcp_group
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonDecodeMetadata,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)
+from vllm.v1.attention.backends.utils import AttentionCGSupport
+from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
+
+logger = init_logger(__name__)
+
+# NOTE(matt): This is an arbitrary number, copied from
+# woosuk's implementation in standard FlashAttention backend
+_DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH = 16
+
+
+class FlashAttnMLABackend(MLACommonBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASH_ATTN_MLA"
+
+    @staticmethod
+    def get_metadata_cls() -> type["FlashAttnMLAMetadata"]:
+        return FlashAttnMLAMetadata
+
+    @staticmethod
+    def get_builder_cls() -> type["FlashAttnMLAMetadataBuilder"]:
+        return FlashAttnMLAMetadataBuilder
+
+    @staticmethod
+    def get_impl_cls() -> type["FlashAttnMLAImpl"]:
+        return FlashAttnMLAImpl
+
+
+@dataclass
+class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata):
+    query_start_loc: torch.Tensor
+    max_query_len: int
+    max_seq_len: int
+    scheduler_metadata: Optional[torch.Tensor] = None
+    max_num_splits: int = 0
+
+
+@dataclass
+class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]):
+    pass
+
+
+class FlashAttnMLAMetadataBuilder(
+        MLACommonMetadataBuilder[FlashAttnMLAMetadata]):
+    cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.UNIFORM_BATCH
+
+    reorder_batch_threshold: ClassVar[int] = 512
+
+    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
+                 vllm_config: VllmConfig, device: torch.device):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device,
+                         FlashAttnMLAMetadata)
+        self.max_num_splits = 0  # No upper bound on the number of splits.
+        self.fa_aot_schedule = (get_flash_attn_version() == 3)
+
+        self.use_full_cuda_graph = \
+            self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+
+        if self.use_full_cuda_graph and self.fa_aot_schedule:
+            self.max_cudagraph_size = self.compilation_config.max_capture_size
+
+            if self.max_cudagraph_size > 992:
+                # This condition derives from FA3's internal heuristic.
+                # TODO(woosuk): Support larger cudagraph sizes.
+                raise ValueError(
+                    "Capture size larger than 992 is not supported for "
+                    "full cuda graph.")
+
+            self.scheduler_metadata = torch.zeros(
+                vllm_config.scheduler_config.max_num_seqs + 1,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # When using cuda graph, we need to set the upper bound of the
+            # number of splits so that large enough intermediate buffers are
+            # pre-allocated during capture.
+            self.max_num_splits = _DEFAULT_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
+
+        # TODO(lucas): Until we add support for the DCP custom masking we need
+        #   to restrict decodes to q_len == 1 when DCP is enabled.
+        self.__class__.reorder_batch_threshold = 1 \
+            if get_dcp_group().world_size > 1 else self.reorder_batch_threshold
+
+    def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens,
+                         max_seq_len, causal):
+        if self.fa_aot_schedule:
+            return get_scheduler_metadata(
+                batch_size=num_reqs,
+                max_seqlen_q=max_query_len,
+                max_seqlen_k=max_seq_len,
+                num_heads_q=self.num_heads,
+                num_heads_kv=1,
+                headdim=self.mla_dims.qk_rope_head_dim,
+                cache_seqlens=seqlens,
+                qkv_dtype=self.kv_cache_spec.dtype,
+                headdim_v=self.mla_dims.kv_lora_rank,
+                page_size=self.page_size,
+                cu_seqlens_q=cu_query_lens,
+                causal=causal,
+                num_splits=self.max_num_splits,
+            )
+        return None
+
+    def _build_decode(self, block_table_tensor: torch.Tensor,
+                      seq_lens_cpu: torch.Tensor,
+                      seq_lens_device: torch.Tensor,
+                      query_start_loc_cpu: torch.Tensor,
+                      query_start_loc_device: torch.Tensor,
+                      num_decode_tokens: int) -> FlashAttnMLADecodeMetadata:
+        query_lens_cpu = (query_start_loc_cpu[1:] - query_start_loc_cpu[:-1])
+        max_query_len = query_lens_cpu.max().item()
+        max_seq_len = seq_lens_cpu.max().item()
+
+        scheduler_metadata = self._schedule_decode(
+            num_reqs=seq_lens_cpu.numel(),
+            cu_query_lens=query_start_loc_device,
+            max_query_len=max_query_len,
+            seqlens=seq_lens_device,
+            max_seq_len=max_seq_len,
+            causal=True,
+        )
+
+        # For FA3 + full cudagraph
+        max_num_splits = 0
+        if self.use_full_cuda_graph and scheduler_metadata is not None:
+            n = scheduler_metadata.shape[0]
+            # Ensure the persistent buffer is large enough
+            assert n <= self.scheduler_metadata.shape[0], \
+                f"Scheduler metadata size {n} exceeds buffer size " + \
+                f"{self.scheduler_metadata.shape[0]}"
+            self.scheduler_metadata[:n] = scheduler_metadata
+            # NOTE(woosuk): We should zero out the rest of the scheduler
+            # metadata to guarantee the correctness. Otherwise, some thread
+            # blocks may use the invalid scheduler metadata and overwrite the
+            # output buffer.
+            self.scheduler_metadata[n:] = 0
+            scheduler_metadata = self.scheduler_metadata[:n]
+
+            if num_decode_tokens <= self.max_cudagraph_size:
+                # NOTE(woosuk): Setting num_splits > 1 may increase the memory
+                # usage, because the intermediate buffers of size [num_splits,
+                # num_heads, num_tokens, head_size] are allocated. Therefore,
+                # we only set num_splits when using cuda graphs.
+                max_num_splits = self.max_num_splits
+
+        return FlashAttnMLADecodeMetadata(
+            block_table=block_table_tensor,
+            seq_lens=seq_lens_device,
+            query_start_loc=query_start_loc_device,
+            max_query_len=max_query_len,
+            max_seq_len=max_seq_len,
+            scheduler_metadata=scheduler_metadata,
+            max_num_splits=max_num_splits,
+        )
+
+
+class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
+    can_return_lse_for_decode: bool = True
+
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         logits_soft_cap, attn_type,
+                         kv_sharing_target_layer_name, **mla_args)
+
+        assert flash_attn_supports_mla(), \
+            "FlashAttnMLA is not supported on this device"
+
+        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "FlashAttnMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, logits_soft_cap")
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashAttnMLAImpl")
+
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FlashAttnMLA V1 with FP8 KV cache not yet supported")
+
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: FlashAttnMLAMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+
+        if type(q) is tuple:
+            q_nope, q_pe = q
+        else:
+            q_nope, q_pe = torch.split(
+                q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "FP8 FlashAttention MLA not yet supported")
+
+        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
+        k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank:]
+
+        # NOTE(matt): During CUDA graph capture, max_query_len can be 0, but the
+        # kernel uses this to calculate grid dimensions. Ensure it's at least 1
+        # to prevent invalid grid configuration during graph capture.
+        max_seqlen_q = max(attn_metadata.decode.max_query_len, 1)
+
+        attn_out = flash_attn_varlen_func(
+            q=q_pe,
+            k=k_pe_cache.unsqueeze(-2),  # Add head dim of 1
+            v=kv_c_cache.unsqueeze(-2),  # Add head dim of 1
+            q_v=q_nope,
+            max_seqlen_q=max_seqlen_q,
+            cu_seqlens_q=attn_metadata.decode.query_start_loc,
+            max_seqlen_k=attn_metadata.decode.max_seq_len,
+            seqused_k=attn_metadata.decode.seq_lens,
+            block_table=attn_metadata.decode.block_table,
+            softmax_scale=self.scale,
+            causal=True,
+            return_softmax_lse=self.need_to_return_lse_for_decode,
+            fa_version=3,  # only version 3 is supported
+            scheduler_metadata=attn_metadata.decode.scheduler_metadata,
+            num_splits=attn_metadata.decode.max_num_splits,
+        )
+
+        if self.need_to_return_lse_for_decode:
+            o, lse = attn_out
+            # FA returns LSE in shape [ H, B ] but DCP wants [ B, H ]
+            return o, lse.transpose(0, 1)  # [ H, B ] -> [ B, H ]
+        else:
+            o = attn_out
+            return o, None
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
new file mode 100644
index 0000000000000000000000000000000000000000..71eb9e0ce70e66ab24848478031343588fc6c06d
--- /dev/null
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import Optional, Union
+
+import torch
+from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
+
+from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
+                                              is_quantized_kv_cache)
+from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
+                                                   MLACommonImpl,
+                                                   MLACommonMetadata)
+
+logger = init_logger(__name__)
+
+FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024
+
+
+class FlashInferMLABackend(MLACommonBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "FLASHINFER_MLA"
+
+    @staticmethod
+    def get_impl_cls() -> type["FlashInferMLAImpl"]:
+        return FlashInferMLAImpl
+
+
+g_fi_workspace = torch.zeros(
+    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
+    dtype=torch.uint8,
+    device="cuda",
+)
+
+
+class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
+
+    def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float,
+            num_kv_heads: int,
+            alibi_slopes: Optional[list[float]],
+            sliding_window: Optional[int],
+            kv_cache_dtype: str,
+            logits_soft_cap: Optional[float],
+            attn_type: str,
+            kv_sharing_target_layer_name: Optional[str],
+            # MLA Specific Arguments
+            **mla_args) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads,
+                         alibi_slopes, sliding_window, kv_cache_dtype,
+                         logits_soft_cap, attn_type,
+                         kv_sharing_target_layer_name, **mla_args)
+
+        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
+        if any(unsupported_features):
+            raise NotImplementedError(
+                "FlashInferMLAImpl does not support one of the following: "
+                "alibi_slopes, sliding_window, logits_soft_cap")
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "FlashInferMLAImpl")
+
+        if is_quantized_kv_cache(self.kv_cache_dtype):
+            raise NotImplementedError(
+                "FlashInferMLA V1 with FP8 KV cache not yet supported")
+
+        self._workspace_buffer = g_fi_workspace
+
+    def _forward_decode(
+        self,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: MLACommonMetadata,
+        layer: AttentionLayer,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        assert kv_c_and_k_pe_cache.numel() > 0
+        assert attn_metadata.decode is not None
+
+        if isinstance(q, tuple):
+            q_nope, q_pe = q
+            q = torch.cat([q_nope, q_pe], dim=-1)
+
+        # trtllm API requires extra dimension q_len_per_request for MTP
+        q = q.unsqueeze(1)
+
+        o = trtllm_batch_decode_with_kv_cache_mla(
+            query=q,
+            kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
+            workspace_buffer=self._workspace_buffer,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            block_tables=attn_metadata.decode.block_table,
+            seq_lens=attn_metadata.decode.seq_lens,
+            max_seq_len=attn_metadata.max_seq_len,
+            bmm1_scale=self.scale,
+        )
+
+        # TODO: Return LSE pending support from Flashinfer API:
+        # https://github.com/flashinfer-ai/flashinfer/pull/1566
+        return o, None
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 1c50144d4790097dc3f72afd558316a481c82774..549af1a06225244e273efd3a4d768e82fbb719ab 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar, Optional, Union
 
 import torch
 
@@ -12,6 +12,7 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
                                          is_flashmla_supported)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms.cuda import CudaPlatform
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonDecodeMetadata,
                                                    MLACommonImpl,
@@ -62,7 +63,6 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device,
                          FlashMLAMetadata)
 
-        self.compilation_config = vllm_config.compilation_config
         self.num_q_heads = vllm_config.model_config.get_num_attention_heads(
             vllm_config.parallel_config)
 
@@ -86,10 +86,14 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
                 dtype=torch.int32)
 
     def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens: torch.Tensor) -> FlashMLADecodeMetadata:
+                      seq_lens_cpu: torch.Tensor,
+                      seq_lens_device: torch.Tensor,
+                      query_start_loc_cpu: torch.Tensor,
+                      query_start_loc_device: torch.Tensor,
+                      num_decode_tokens: int) -> FlashMLADecodeMetadata:
         tile_scheduler_metadata, num_splits = \
             get_mla_metadata(
-            seq_lens,
+            seq_lens_device,
             self.num_q_heads,
             1, # MQA for the decode path
         )
@@ -123,7 +127,7 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
 
         return FlashMLADecodeMetadata(
             block_table=block_table_tensor,
-            seq_lens=seq_lens,
+            seq_lens=seq_lens_device,
             tile_scheduler_metadata=tile_scheduler_metadata,
             num_splits=num_splits,
         )
@@ -131,6 +135,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
 
 class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
 
+    can_return_lse_for_decode: bool = True
+
     def __init__(
             self,
             num_heads: int,
@@ -153,6 +159,16 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
         assert is_flashmla_supported(), \
             "FlashMLA is not supported on this device"
 
+        # disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs
+        # context:
+        # https://github.com/deepseek-ai/FlashMLA/issues/83
+        # https://github.com/vllm-project/vllm/issues/24513
+        if CudaPlatform.has_device_capability(100):
+            raise NotImplementedError(
+                "FlashMLA is temporarily disabled on Blackwell (SM 10.0). "
+                "Please use CUTLASS_MLA or TRITON_MLA instead. "
+                "Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`")
+
         unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
         if any(unsupported_features):
             raise NotImplementedError(
@@ -167,20 +183,20 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
 
     def _forward_decode(
         self,
-        q_nope: torch.Tensor,
-        q_pe: torch.Tensor,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: FlashMLAMetadata,
         layer: AttentionLayer,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
-        q = torch.cat([q_nope, q_pe], dim=-1)\
-            .unsqueeze(1) # Add seqlen dim of 1 (decode)
+        if type(q) is tuple:
+            q = torch.cat(q, dim=-1)
 
-        o, _ = flash_mla_with_kvcache(
-            q=q,
+        assert isinstance(q, torch.Tensor)
+        o, lse = flash_mla_with_kvcache(
+            q=q.unsqueeze(1),  # Add seqlen dim of 1 (decode)
             k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1
             block_table=attn_metadata.decode.block_table,
             cache_seqlens=attn_metadata.decode.seq_lens,
@@ -194,4 +210,4 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
             descale_k=layer._k_scale.reshape(1),
         )
 
-        return self._v_up_proj(o)
+        return o, lse
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 870cc600388e7feff10cc72ea1966af20524cd96..db27a34d8959a98b6ed09baba0db9c90b53df7ec 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar, Optional, Union
 
 import torch
 
@@ -105,11 +105,15 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
                                           device=device)
 
     def _build_decode(self, block_table_tensor: torch.Tensor,
-                      seq_lens: torch.Tensor) -> AiterMLADecodeMetadata:
+                      seq_lens_cpu: torch.Tensor,
+                      seq_lens_device: torch.Tensor,
+                      query_start_loc_cpu: torch.Tensor,
+                      query_start_loc_device: torch.Tensor,
+                      num_decode_tokens: int) -> AiterMLADecodeMetadata:
         page_size = self.kv_cache_spec.block_size
-        block_table_bounds = (seq_lens + page_size - 1) // page_size
+        block_table_bounds = (seq_lens_device + page_size - 1) // page_size
         device = self.device
-        num_reqs = seq_lens.size(0)
+        num_reqs = seq_lens_device.size(0)
 
         mask = (torch.arange(block_table_tensor.size(1),
                              dtype=block_table_tensor.dtype,
@@ -117,7 +121,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
                 < block_table_bounds.unsqueeze(1))
         paged_kv_indices = block_table_tensor[mask]
 
-        paged_kv_last_page_len = seq_lens % page_size
+        paged_kv_last_page_len = seq_lens_device % page_size
         paged_kv_last_page_len = torch.where(paged_kv_last_page_len == 0,
                                              page_size, paged_kv_last_page_len)
 
@@ -156,7 +160,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
 
         attn_metadata = AiterMLADecodeMetadata(
             block_table=block_table_tensor,
-            seq_lens=seq_lens,
+            seq_lens=seq_lens_device,
             paged_kv_indptr=paged_kv_indptr,
             paged_kv_indices=paged_kv_indices,
             paged_kv_last_page_len=paged_kv_last_page_len,
@@ -218,18 +222,19 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
 
     def _forward_decode(
         self,
-        q_nope: torch.Tensor,
-        q_pe: torch.Tensor,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: AiterMLAMetadata,
         layer: AttentionLayer,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
-        B = q_nope.shape[0]
+        if type(q) is tuple:
+            q = torch.cat(q, dim=-1)
 
-        q = torch.cat([q_nope, q_pe], dim=-1)
+        assert isinstance(q, torch.Tensor)
+        B = q.shape[0]
         o = torch.zeros(B,
                         self.num_heads,
                         self.kv_lora_rank,
@@ -247,4 +252,4 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
                              attn_metadata.decode.paged_kv_indices,
                              attn_metadata.decode.paged_kv_last_page_len)
 
-        return self._v_up_proj(o)
+        return o, None
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index f2974ed668d99dc9e250729a3bafe89ce0dc9b2f..d692b00d78b463310e7f05554051944a39472a0d 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 
@@ -123,21 +123,22 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
 
     def _forward_decode(
         self,
-        q_nope: torch.Tensor,
-        q_pe: torch.Tensor,
+        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
         if self.kv_cache_dtype.startswith("fp8"):
             raise NotImplementedError("FP8 Triton MLA not yet supported")
 
-        B = q_nope.shape[0]
+        if type(q) is tuple:
+            q = torch.cat(q, dim=-1)
 
-        q = torch.cat([q_nope, q_pe], dim=-1)
+        assert isinstance(q, torch.Tensor)
+        B = q.shape[0]
         o = torch.zeros(B,
                         self.num_heads,
                         self.kv_lora_rank,
@@ -171,4 +172,4 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
                              attn_metadata.decode.seq_lens, attn_logits,
                              num_kv_splits, self.scale, PAGE_SIZE)
 
-        return self._v_up_proj(o)
+        return o, None
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 173a0a255e49146c2d175effefd47594e5540d06..a4e2758bd311f49b85b7ee4c7afffa740350ebeb 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -236,11 +236,11 @@ class AiterFlashAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        self.vllm_config = vllm_config
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
         self.cache_config = vllm_config.cache_config
-        self.device = device
 
         self.num_heads_q = self.model_config.get_num_attention_heads(
             self.parallel_config)
@@ -248,7 +248,6 @@ class AiterFlashAttentionMetadataBuilder(
             self.parallel_config)
         self.headdim = self.model_config.get_head_size()
         self.block_size = kv_cache_spec.block_size
-        self.kv_cache_spec = kv_cache_spec
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
         self.aot_sliding_window: Optional[tuple[int, int]] = None
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index d80ced8ec876a100260d49c5f88248f6e33ec454..f5ad65b02b4d48c08a9464bceaec5d0b391d6d8f 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -45,8 +45,8 @@ class ShortConvAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         assert isinstance(kv_cache_spec, MambaSpec)
-        self.kv_cache_spec = kv_cache_spec
 
     def build(self,
               common_prefix_len: int,
@@ -58,8 +58,9 @@ class ShortConvAttentionMetadataBuilder(
         state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=1))
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
         has_initial_states = None
         if num_prefills > 0:
             #[batch,]
@@ -78,4 +79,4 @@ class ShortConvAttentionMetadataBuilder(
             has_initial_states=has_initial_states,
             state_indices_tensor=state_indices_tensor,
         )
-        return attn_metadata
\ No newline at end of file
+        return attn_metadata
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index b96d957a150b56a70fd22e588536168f0ba7c91b..10238f36455d274b26d2fb8b48f2da12cf64b087 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -165,7 +165,8 @@ class TreeAttentionMetadataBuilder(
         vllm_config: VllmConfig,
         device: torch.device,
     ):
-        self.kv_cache_spec = kv_cache_spec
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         self.block_size = kv_cache_spec.block_size
 
         spec_config = vllm_config.speculative_config
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index a37a7f6811ef9ffb000ebdf34dcf08944955402b..c294a5a73cbdd5745a9b6cf7215c44807a10fe34 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -7,7 +7,6 @@ from typing import ClassVar, Optional
 
 import torch
 
-from vllm import _custom_ops as ops
 from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
@@ -16,6 +15,8 @@ from vllm.attention.ops.chunked_prefill_paged_decode import (
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey, kFp8StaticTensorSym)
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import (AttentionCGSupport,
@@ -23,6 +24,11 @@ from vllm.v1.attention.backends.utils import (AttentionCGSupport,
                                               CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
+if current_platform.is_cuda_alike():
+    from vllm import _custom_ops as ops
+elif current_platform.is_xpu():
+    from vllm._ipex_ops import ipex_ops as ops
+
 logger = init_logger(__name__)
 
 
@@ -62,9 +68,9 @@ class TritonAttentionMetadataBuilder(
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        self.device = device
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         self.block_size = kv_cache_spec.block_size
-        self.kv_cache_spec = kv_cache_spec
 
         model_config = vllm_config.model_config
         self.num_heads_q = model_config.get_num_attention_heads(
@@ -198,6 +204,9 @@ def use_aiter_unified_attention() -> bool:
 
 class TritonAttentionImpl(AttentionImpl):
 
+    def fused_output_quant_supported(self, quant_key: QuantKey):
+        return quant_key == kFp8StaticTensorSym
+
     def __init__(
         self,
         num_heads: int,
@@ -293,9 +302,9 @@ class TritonAttentionImpl(AttentionImpl):
         """
         assert output is not None, "Output tensor must be provided."
 
-        if output_scale is not None or output_block_scale is not None:
+        if output_block_scale is not None:
             raise NotImplementedError(
-                "fused output quantization is not yet supported"
+                "fused block_scale output quantization is not yet supported"
                 " for TritonAttentionImpl")
 
         if attn_metadata is None:
@@ -337,7 +346,7 @@ class TritonAttentionImpl(AttentionImpl):
                     layer._v_scale,
                 )
             else:
-                torch.ops._C_cache_ops.reshape_and_cache_flash(
+                ops.reshape_and_cache_flash(
                     key,
                     value,
                     key_cache,
@@ -354,9 +363,10 @@ class TritonAttentionImpl(AttentionImpl):
             num_tokens, num_heads, head_size = query.shape
             assert layer._q_scale == 1.0, \
                 "A non 1.0 q_scale is not currently supported."
-            if not current_platform.is_rocm():
-                # Skip Q quantization on ROCm, since dequantizing back to
-                # f32 in the attention kernel is not supported.
+            if current_platform.is_cuda():
+                # Skip Q quantization on ROCm and XPU, enable this on cuda
+                # only, since dequantizing back to f32 in the attention kernel
+                # is not supported.
                 query, _ = ops.scaled_fp8_quant(
                     query.reshape(
                         (num_tokens, num_heads * head_size)).contiguous(),
@@ -389,6 +399,7 @@ class TritonAttentionImpl(AttentionImpl):
                 alibi_slopes=self.alibi_slopes,
                 sliding_window=self.sliding_window[0],
                 sm_scale=self.scale,
+                output_scale=output_scale,
                 sinks=self.sinks,
             )
 
@@ -414,6 +425,6 @@ class TritonAttentionImpl(AttentionImpl):
                 k_descale=layer._k_scale.expand(descale_shape),
                 v_descale=layer._v_scale.expand(descale_shape),
                 sinks=self.sinks,
-            )
+                output_scale=output_scale)
 
         return output
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 011a90ece01bdc48c056d3252e2aa0aeda2e1fff..009943fa743d89c10eb6e2140a9ab5e6ebea6380 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -72,6 +72,9 @@ class CommonAttentionMetadata:
     logits_indices_padded: Optional[torch.Tensor] = None
     num_logits_indices: Optional[int] = None
 
+    # Needed by CrossAttentionBuilder
+    encoder_seq_lens: Optional[np.ndarray] = None
+
 
 @dataclass
 class UbatchSlice:
@@ -193,6 +196,9 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
         self.kv_cache_spec = kv_cache_spec
+        self.layer_names = layer_names
+        self.vllm_config = vllm_config
+        self.device = device
 
     @abstractmethod
     def build(self,
@@ -542,7 +548,14 @@ def make_local_attention_virtual_batches(
                                                    1)
     batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32),
                               local_blocks * pages_per_local_batch)
-    block_table_local = block_table[batch_indices, block_indices]\
+
+    # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance
+    # regression when using numpy arrays (batch and block indices) to index into
+    # torch tensor (block_table). As a workaround, convert numpy arrays to torch
+    # tensor first, which recovers perf.
+    batch_indices_torch = torch.from_numpy(batch_indices)
+    block_indices_torch = torch.from_numpy(block_indices)
+    block_table_local = block_table[batch_indices_torch, block_indices_torch]\
         .view(virtual_batches, -1)
 
     query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)
@@ -709,7 +722,7 @@ def reorder_batch_to_split_decodes_and_prefills(
 
     for i, req_id in enumerate(input_batch.req_ids):
         num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-        # for now treat 1 scheduled token as "decode" even if its not,
+        # for now treat 1 scheduled token as "decode" even if it's not,
         # we should update this to something like < 8 in the future but
         # currently the TritonMLA._forward_decode only supports
         # num_tokens = 1
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index 7f888c113574312265d459bbd8f5087de0ddefc9..a6ca3349123534fb5fe225db7361df27ec58513c 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -3,7 +3,7 @@
 """Attention layer with XFormersAttention."""
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, ClassVar, Optional
 
 import torch
 
@@ -197,6 +197,8 @@ class XFormersAttentionMetadata:
 class XFormersAttentionMetadataBuilder(
         AttentionMetadataBuilder[XFormersAttentionMetadata]):
 
+    reorder_batch_threshold: ClassVar[int] = 1
+
     def __init__(
         self,
         kv_cache_spec: AttentionSpec,
@@ -204,17 +206,19 @@ class XFormersAttentionMetadataBuilder(
         vllm_config: VllmConfig,
         device: torch.device,
     ):
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
+
         assert XFORMERS_AVAILABLE
-        self.kv_cache_spec = kv_cache_spec
         self.block_size = kv_cache_spec.block_size
         self._num_decodes = 0
         self._num_decode_tokens = 0
 
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(input_batch,
-                                                           scheduler_output,
-                                                           decode_threshold=1)
+        return reorder_batch_to_split_decodes_and_prefills(
+            input_batch,
+            scheduler_output,
+            decode_threshold=self.reorder_batch_threshold)
 
     def build(
         self,
@@ -223,8 +227,9 @@ class XFormersAttentionMetadataBuilder(
         fast_build: bool = False,
     ) -> XFormersAttentionMetadata:
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(common_attn_metadata,
-                                       decode_threshold=1))
+            split_decodes_and_prefills(
+                common_attn_metadata,
+                decode_threshold=self.reorder_batch_threshold))
 
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         q_start_loc = common_attn_metadata.query_start_loc
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index b537cac8e1d72e800bb8aca0ee7933f4e9c355c6..d1e1c1c8d038292c35718bb20b995f5e973dfe19 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -9,7 +9,11 @@ from vllm.distributed.kv_events import (MEDIUM_GPU, AllBlocksCleared,
                                         KVCacheEvent)
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
-                                         FreeKVCacheBlockQueue, KVCacheBlock)
+                                         ExternalBlockHash,
+                                         FreeKVCacheBlockQueue, KVCacheBlock,
+                                         get_block_hash,
+                                         make_block_hash_with_group_id,
+                                         maybe_convert_block_hash)
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -84,8 +88,10 @@ class BlockPool:
         """
         cached_blocks = []
         for group_id in kv_cache_group_ids:
+            block_hash_with_group_id = make_block_hash_with_group_id(
+                block_hash, group_id)
             cached_blocks_one_group = self.cached_block_hash_to_block.get(
-                BlockHashWithGroupId(block_hash, group_id))
+                block_hash_with_group_id)
             if not cached_blocks_one_group:
                 return None
             first_block = next(iter(cached_blocks_one_group.values()))
@@ -124,28 +130,29 @@ class BlockPool:
         assert len(request.block_hashes) >= num_full_blocks
         new_block_hashes = request.block_hashes[num_cached_blocks:]
 
-        new_hashes: Optional[list[int]] = ([] if self.enable_kv_cache_events
-                                           else None)
+        new_hashes: Optional[list[ExternalBlockHash]] = (
+            [] if self.enable_kv_cache_events else None)
         for i, blk in enumerate(new_full_blocks):
             assert blk.block_hash is None
             block_hash = new_block_hashes[i]
 
             # Update and added the full block to the cache.
-            block_hash_with_group_id = BlockHashWithGroupId(
+            block_hash_with_group_id = make_block_hash_with_group_id(
                 block_hash, kv_cache_group_id)
             blk.block_hash = block_hash_with_group_id
             self.cached_block_hash_to_block[block_hash_with_group_id][
                 blk.block_id] = blk
             if new_hashes is not None:
-                new_hashes.append(block_hash.hash_value)
+                new_hashes.append(maybe_convert_block_hash(block_hash))
 
         if self.enable_kv_cache_events:
             if num_cached_blocks == 0:
-                parent_block_hash = None
+                parent_block_hash: Optional[ExternalBlockHash] = None
             else:
                 parent_block = blocks[num_cached_blocks - 1]
                 assert parent_block.block_hash is not None
-                parent_block_hash = parent_block.block_hash.get_hash_value()
+                parent_block_hash = maybe_convert_block_hash(
+                    get_block_hash(parent_block.block_hash))
 
             self.kv_event_queue.append(
                 BlockStored(
@@ -220,7 +227,9 @@ class BlockPool:
             # we disable hybrid kv cache manager when kv cache event is
             # enabled, so there is only one group.
             self.kv_event_queue.append(
-                BlockRemoved(block_hashes=[block_hash.get_hash_value()],
+                BlockRemoved(block_hashes=[
+                    maybe_convert_block_hash(get_block_hash(block_hash))
+                ],
                              medium=MEDIUM_GPU))
         return True
 
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index 9421341f990c859161477cd4833447d94ecf50a7..86771060c409928edf6ed110465aef8e0ac8db94 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -24,6 +24,7 @@ class KVCacheCoordinator(ABC):
         use_eagle: bool,
         enable_caching: bool,
         enable_kv_cache_events: bool,
+        dcp_world_size: int,
     ):
         self.kv_cache_config = kv_cache_config
         self.max_model_len = max_model_len
@@ -39,6 +40,7 @@ class KVCacheCoordinator(ABC):
                 kv_cache_spec=kv_cache_group.kv_cache_spec,
                 block_pool=self.block_pool,
                 kv_cache_group_id=i,
+                dcp_world_size=dcp_world_size,
             ) for i, kv_cache_group in enumerate(
                 self.kv_cache_config.kv_cache_groups))
 
@@ -197,9 +199,14 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator):
     """
 
     def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
-                 use_eagle: bool, enable_kv_cache_events: bool):
-        super().__init__(kv_cache_config, max_model_len, use_eagle, False,
-                         enable_kv_cache_events)
+                 use_eagle: bool, enable_kv_cache_events: bool,
+                 dcp_world_size: int):
+        super().__init__(kv_cache_config,
+                         max_model_len,
+                         use_eagle,
+                         False,
+                         enable_kv_cache_events,
+                         dcp_world_size=dcp_world_size)
         self.num_single_type_manager = len(self.single_type_managers)
 
     def get_num_common_prefix_blocks(self, request_id: str,
@@ -225,12 +232,19 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
 
     def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
                  use_eagle: bool, enable_caching: bool,
-                 enable_kv_cache_events: bool):
-        super().__init__(kv_cache_config, max_model_len, use_eagle,
-                         enable_caching, enable_kv_cache_events)
+                 enable_kv_cache_events: bool, dcp_world_size: int):
+        super().__init__(kv_cache_config,
+                         max_model_len,
+                         use_eagle,
+                         enable_caching,
+                         enable_kv_cache_events,
+                         dcp_world_size=dcp_world_size)
         self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[
             0].kv_cache_spec
         self.block_size = self.kv_cache_spec.block_size
+        self.dcp_world_size = dcp_world_size
+        if dcp_world_size > 1:
+            self.block_size *= dcp_world_size
         assert len(self.kv_cache_config.kv_cache_groups) == 1, (
             "UnitaryKVCacheCoordinator assumes only one kv cache group")
 
@@ -246,6 +260,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator):
             block_pool=self.block_pool,
             kv_cache_spec=self.kv_cache_spec,
             use_eagle=self.use_eagle,
+            dcp_world_size=self.dcp_world_size,
         )
         return hit_blocks, len(hit_blocks[0]) * self.block_size
 
@@ -261,9 +276,14 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
 
     def __init__(self, kv_cache_config: KVCacheConfig, max_model_len: int,
                  use_eagle: bool, enable_caching: bool,
-                 enable_kv_cache_events: bool):
-        super().__init__(kv_cache_config, max_model_len, use_eagle,
-                         enable_caching, enable_kv_cache_events)
+                 enable_kv_cache_events: bool, dcp_world_size: int):
+        super().__init__(kv_cache_config,
+                         max_model_len,
+                         use_eagle,
+                         enable_caching,
+                         enable_kv_cache_events,
+                         dcp_world_size=dcp_world_size)
+        assert dcp_world_size == 1, "DCP not support hybrid attn now."
         self.verify_and_split_kv_cache_groups()
 
     def verify_and_split_kv_cache_groups(self) -> None:
@@ -394,17 +414,27 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
         return hit_blocks, hit_length
 
 
-def get_kv_cache_coordinator(
-        kv_cache_config: KVCacheConfig, max_model_len: int, use_eagle: bool,
-        enable_caching: bool,
-        enable_kv_cache_events: bool) -> KVCacheCoordinator:
+def get_kv_cache_coordinator(kv_cache_config: KVCacheConfig,
+                             max_model_len: int, use_eagle: bool,
+                             enable_caching: bool,
+                             enable_kv_cache_events: bool,
+                             dcp_world_size: int) -> KVCacheCoordinator:
     if not enable_caching:
-        return KVCacheCoordinatorNoPrefixCache(kv_cache_config, max_model_len,
+        return KVCacheCoordinatorNoPrefixCache(kv_cache_config,
+                                               max_model_len,
                                                use_eagle,
-                                               enable_kv_cache_events)
+                                               enable_kv_cache_events,
+                                               dcp_world_size=dcp_world_size)
     if len(kv_cache_config.kv_cache_groups) == 1:
-        return UnitaryKVCacheCoordinator(kv_cache_config, max_model_len,
-                                         use_eagle, enable_caching,
-                                         enable_kv_cache_events)
-    return HybridKVCacheCoordinator(kv_cache_config, max_model_len, use_eagle,
-                                    enable_caching, enable_kv_cache_events)
+        return UnitaryKVCacheCoordinator(kv_cache_config,
+                                         max_model_len,
+                                         use_eagle,
+                                         enable_caching,
+                                         enable_kv_cache_events,
+                                         dcp_world_size=dcp_world_size)
+    return HybridKVCacheCoordinator(kv_cache_config,
+                                    max_model_len,
+                                    use_eagle,
+                                    enable_caching,
+                                    enable_kv_cache_events,
+                                    dcp_world_size=dcp_world_size)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 87a11fe58a0485418627bbbbd94847f25d84fff9..3a0fbb5e5c41e204f26b131be4e5c7879a4ea693 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -91,6 +91,7 @@ class KVCacheManager:
         use_eagle: bool = False,
         log_stats: bool = False,
         enable_kv_cache_events: bool = False,
+        dcp_world_size: int = 1,
     ) -> None:
         self.max_model_len = max_model_len
 
@@ -109,12 +110,20 @@ class KVCacheManager:
             self.block_size = kv_cache_config.kv_cache_groups[
                 0].kv_cache_spec.block_size
 
+            if dcp_world_size > 1:
+                assert len(kv_cache_config.kv_cache_groups) == 1
+                # Note(hc): need revisit. When both DCP and any future
+                # PCP are enabled, the block_size may need to be scaled
+                # by a factor of dcp_size × pcp_size?
+                self.block_size *= dcp_world_size
+
         self.coordinator = get_kv_cache_coordinator(
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
             enable_caching=self.enable_caching,
             enable_kv_cache_events=enable_kv_cache_events,
+            dcp_world_size=dcp_world_size,
         )
         self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups)
         self.block_pool = self.coordinator.block_pool
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 590baa6208d078eb051d1c3b70b56ce0843f5d06..2c0eac3ddd79d9e0a4f1bb54a538e4bab052b6b0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -6,11 +6,12 @@ import os
 from collections import defaultdict, deque
 from collections.abc import Iterable, Sequence
 from dataclasses import astuple, dataclass
-from typing import Any, Callable, NamedTuple, Optional
+from typing import Any, Callable, NewType, Optional, Union
 
+from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import GiB_bytes, cdiv, sha256_cbor_64bit
+from vllm.utils import GiB_bytes, cdiv, sha256_cbor
 from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
                                         FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
@@ -18,59 +19,78 @@ from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
-logger = init_logger(__name__)
+# BlockHash represents the hash of a single KV-cache block used for
+# prefix caching.  Treating it as a distinct type from ``bytes`` helps
+# catch accidental misuse when passing around raw byte strings.
+BlockHash = NewType("BlockHash", bytes)
+
+# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID.
+# It is represented as raw bytes for compactness and efficiency. The helper
+# functions below pack/unpack the ``BlockHash`` and group id into/from the key.
+BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
+
+# ExternalBlockHash is used for reproducible prefix-cache block hashing.
+# It's a union of ``bytes`` and ``int`` to keep backward compatibility
+# after we default block hashing to use sha256 bytes.
+ExternalBlockHash = Union[bytes, int]
+
 
+def make_block_hash_with_group_id(block_hash: BlockHash,
+                                  group_id: int) -> BlockHashWithGroupId:
+    """Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``.
 
-class BlockHash(NamedTuple):
-    """Hash value of a block (int), the token IDs in the block, and extra keys.
-    We keep a tuple of token IDs and extra keys to reduce the likelihood of
-    hash collisions when the hash value is the same. By using SHA256 however,
-    hash collisions are practically impossible.
+    The group id is encoded using 4 bytes in big-endian order and appended to
+    the block hash bytes.  This representation avoids creating tuples while
+    still allowing us to recover both components when needed.
     """
-    # Hash value of the block in an integer.
-    hash_value: int
-    # Token IDs in the block.
-    token_ids: tuple[int, ...]
-    # Extra keys for the block.
-    extra_keys: Optional[Any] = None
+    return BlockHashWithGroupId(block_hash +
+                                group_id.to_bytes(4, "big", signed=False))
+
+
+def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
+    """Extract the ``BlockHash`` from a ``BlockHashWithGroupId``."""
+    return BlockHash(key[:-4])
+
 
+def get_group_id(key: BlockHashWithGroupId) -> int:
+    """Extract the group id from a ``BlockHashWithGroupId``."""
+    return int.from_bytes(key[-4:], "big", signed=False)
 
-class BlockHashWithGroupId(NamedTuple):
-    # The hash value for the contents (e.g., token_ids) of a block without group
-    # ID. The value is the same for blocks representing the same tokens but for
-    # different groups.
-    block_hash: BlockHash
-    # The KV cache group ID.
-    group_id: int
 
-    def get_hash_value(self) -> int:
-        return self.block_hash.hash_value
+def maybe_convert_block_hash(hash_bytes: BlockHash) -> ExternalBlockHash:
+    if not envs.VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES:
+        return hash_bytes
+    return int.from_bytes(hash_bytes, byteorder="big") & ((1 << 64) - 1)
 
 
+logger = init_logger(__name__)
+
 # The hash seed for the first block of any prefix block sequence.
 #
 # We use a random value to avoid hash collisions or PYTHONHASHSEED environment
-# variable if set such that processes can share the seed if needed.
-# This aligns with the behavior of Python's hash() function, which also uses
-# a random seed if PYTHONHASHSEED is not set.
+# variable if set such that processes can share the seed if needed. This aligns
+# with the behavior of Python's hash() function, which also uses a random seed
+# if PYTHONHASHSEED is not set.
 #
 # The function `init_none_hash` initializes this variable globally.
-NONE_HASH: int
+NONE_HASH: BlockHash
 
 
-def init_none_hash(hash_fn: Callable):
+def init_none_hash(hash_fn: Callable[[Any], bytes]):
     global NONE_HASH
 
     hash_seed = os.getenv("PYTHONHASHSEED")
-    if hash_seed is None and hash_fn is sha256_cbor_64bit:
+    if hash_seed is None and hash_fn is sha256_cbor:
         logger.warning(
             "PYTHONHASHSEED is not set. This will lead to non-reproducible "
-            "block-hashes when using sha256_cbor_64bit as the hash function."
+            "block-hashes when using sha256_cbor as the hash function."
             "Consider setting PYTHONHASHSEED to a fixed value for "
             "reproducibility.")
 
-    NONE_HASH = (int.from_bytes(os.urandom(32), byteorder="big")
-                 if hash_seed is None else hash_fn(hash_seed))
+    if hash_seed is None:
+        NONE_HASH = BlockHash(os.urandom(32))
+    else:
+        NONE_HASH = BlockHash(hash_fn(hash_seed))
 
 
 class PrefixCachingMetrics:
@@ -142,8 +162,8 @@ class KVCacheBlock:
     block_id: int
     # Reference count.
     ref_cnt: int = 0
-    # The hash of the block composed of (block hash, tuple of token IDs).
-    # It is only available when the block is full.
+    # The hash key (block hash + group id) of the block, only available
+    # when the block is full and cached.
     _block_hash: Optional[BlockHashWithGroupId] = None
 
     # Used to construct a doubly linked list for free blocks.
@@ -177,7 +197,7 @@ class KVCacheBlock:
                          if self.next_free_block else None)
         return (f"KVCacheBlock(block_id={self.block_id}, "
                 f"ref_cnt={self.ref_cnt}, "
-                f"_block_hash={self._block_hash}, "
+                f"_block_hash={self._block_hash!r}, "
                 f"prev_free_block={prev_block_id}, "
                 f"next_free_block={next_block_id})")
 
@@ -217,7 +237,7 @@ class FreeKVCacheBlockQueue:
         # Create a fake head and a tail block for the doubly linked list to
         # reduce branching in the code
         #
-        # The implementation garenteed that the fake head and tail
+        # The implementation guaranteed that the fake head and tail
         # are NEVER got popped, so we could safely assume each real blocks
         # in the queue has prev and next blocks.
         self.fake_free_list_head = KVCacheBlock(block_id=-1)
@@ -517,15 +537,14 @@ def generate_block_hash_extra_keys(
 
 
 def hash_block_tokens(
-        hash_function: Callable,
-        parent_block_hash: Optional[int],
+        hash_function: Callable[[Any], bytes],
+        parent_block_hash: Optional[BlockHash],
         curr_block_token_ids: Sequence[int],
         extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
     prefix caching. We use LRU cache for this function to avoid recomputing
     hash values for the same block contents.
-
     Args:
         hash_function: The hash function used to compute block hash.
         parent_block_hash: The hash of the parent block. None
@@ -533,7 +552,6 @@ def hash_block_tokens(
         curr_block_token_ids: A list of token ids in the current
             block. The current block is assumed to be full.
         extra_keys: Extra keys for the block.
-
     Returns:
         The hash value of the block and the token ids in the block.
         The entire tuple is used as the hash key of the block.
@@ -544,26 +562,16 @@ def hash_block_tokens(
     curr_block_token_ids_tuple = tuple(curr_block_token_ids)
     return BlockHash(
         hash_function(
-            (parent_block_hash, curr_block_token_ids_tuple, extra_keys)),
-        curr_block_token_ids_tuple, extra_keys)
+            (parent_block_hash, curr_block_token_ids_tuple, extra_keys)))
 
 
 def get_request_block_hasher(
     block_size: int,
-    caching_hash_fn: Callable[[Any],
-                              int]) -> Callable[[Request], list[BlockHash]]:
+    caching_hash_fn: Callable[[Any], bytes],
+) -> Callable[[Request], list[BlockHash]]:
     """
     Returns a function which computes the list of un-computed block hashes
-    of a request.
-
-    Each request holds a list of its block hashes (request.block_hashes).
-    When a request is created, it calls the below function to compute
-    the hashes of all full blocks of the request's initial tokens.
-    The hashes are then stored in request.block_hashes.
-    Later, whenever new tokens are appended to the request, it calls
-    the below function again to compute any new full blocks of tokens.
-    The returned new hashes are appended to request.block_hashes.
-    """
+    of a request."""
 
     def request_block_hasher(request: Request) -> list[BlockHash]:
         start_token_idx = len(request.block_hashes) * block_size
@@ -577,8 +585,8 @@ def get_request_block_hasher(
             # last mm input.
             curr_mm_idx = -1
 
-        prev_block_hash_value = request.block_hashes[-1].hash_value \
-            if request.block_hashes else None
+        prev_block_hash_value = (request.block_hashes[-1]
+                                 if request.block_hashes else None)
         new_block_hashes: list[BlockHash] = []
         while True:
             end_token_idx = start_token_idx + block_size
@@ -598,7 +606,7 @@ def get_request_block_hasher(
 
             new_block_hashes.append(block_hash)
             start_token_idx += block_size
-            prev_block_hash_value = block_hash.hash_value
+            prev_block_hash_value = block_hash
 
         return new_block_hashes
 
@@ -846,6 +854,12 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
     )
 
     num_tokens = num_blocks * vllm_config.cache_config.block_size
+    if vllm_config.parallel_config.decode_context_parallel_size > 1:
+        num_tokens *= vllm_config.parallel_config.decode_context_parallel_size
+        logger.info(
+            "Multiplying the GPU KV cache size by the dcp_world_size %d.",
+            vllm_config.parallel_config.decode_context_parallel_size)
+
     num_tokens_str = f"{num_tokens:,}"
     logger.info("GPU KV cache size: %s tokens", num_tokens_str)
     max_model_len_str = f"{vllm_config.model_config.max_model_len:,}"
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index b5cd6c5c8af51a4e7765461f709f99701c02bf19..9888f25735753d950861c2a1de1220c72e7a6037 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -6,6 +6,8 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
+from vllm import bc_linter_include
+
 if TYPE_CHECKING:
     import numpy as np
     import numpy.typing as npt
@@ -19,6 +21,7 @@ if TYPE_CHECKING:
     from vllm.v1.request import Request
 
 
+@bc_linter_include
 @dataclass
 class NewRequestData:
 
@@ -80,6 +83,7 @@ class NewRequestData:
                 ")")
 
 
+@bc_linter_include
 @dataclass
 class CachedRequestData:
 
@@ -109,6 +113,7 @@ class CachedRequestData:
         )
 
 
+@bc_linter_include
 @dataclass
 class SchedulerOutput:
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index d4391b1c2137a976482305a64b66176865255507..aa45f6669207d467451026c4f6e5da78424af025 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -100,6 +100,15 @@ class Scheduler(SchedulerInterface):
 
         self.block_size = self.cache_config.block_size
 
+        self.dcp_world_size = \
+            vllm_config.parallel_config.decode_context_parallel_size
+        # Note(hc): The scheduler’s block_size must be multiplied
+        # by dcp_world_size, since block hashes are computed on the
+        # original full token sequence at a granularity of
+        # original_block_size × dcp_world_size.
+        if self.dcp_world_size > 1:
+            self.block_size *= self.dcp_world_size
+
         # req_id -> Request
         self.requests: dict[str, Request] = {}
         # Scheduling policy
@@ -135,8 +144,8 @@ class Scheduler(SchedulerInterface):
         )
 
         # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
-        # projector if needed). Currently, we assume that the encoder also
-        # has the Transformer architecture (e.g., ViT).
+        # projector if needed) for MM models as well as encoder-decoder
+        # transformers.
         self.max_num_encoder_input_tokens = encoder_compute_budget
         # NOTE: For the models without encoder (e.g., text-only models),
         # the encoder cache will not be initialized because cache size is 0
@@ -161,6 +170,7 @@ class Scheduler(SchedulerInterface):
             use_eagle=self.use_eagle,
             log_stats=self.log_stats,
             enable_kv_cache_events=self.enable_kv_cache_events,
+            dcp_world_size=self.dcp_world_size,
         )
         self.use_pp = self.parallel_config.pipeline_parallel_size > 1
 
@@ -377,6 +387,14 @@ class Scheduler(SchedulerInterface):
                             self.connector.get_num_new_matched_tokens(
                                 request, num_new_local_computed_tokens))
 
+                        if num_external_computed_tokens is None:
+                            # The request cannot be scheduled because
+                            # the KVConnector couldn't determine
+                            # the number of matched tokens.
+                            self.waiting.pop_request()
+                            skipped_waiting_requests.prepend_request(request)
+                            continue
+
                     # Total computed tokens (local + external).
                     num_computed_tokens = (num_new_local_computed_tokens +
                                            num_external_computed_tokens)
@@ -757,15 +775,19 @@ class Scheduler(SchedulerInterface):
                 # in the decoder's KV cache.
                 continue
 
-            # The same encoder input has already been scheduled in the current
-            # step.
-            if request.mm_hashes[i] in mm_hashes_to_schedule:
-                continue
+            if not self.is_encoder_decoder:
+                # We are not using the encoder cache for encoder-decoder models,
+                # yet.
+                if request.mm_hashes[i] in mm_hashes_to_schedule:
+                    # The same encoder input has already been scheduled in the
+                    # current step.
+                    continue
 
-            if self.encoder_cache_manager.check_and_update_cache(request, i):
-                # The encoder input is already computed and cached from a
-                # previous step.
-                continue
+                if self.encoder_cache_manager.check_and_update_cache(
+                        request, i):
+                    # The encoder input is already computed and cached from a
+                    # previous step.
+                    continue
 
             # If no encoder input chunking is allowed, we do not want to
             # partially schedule a multimodal item. If the scheduled range would
@@ -815,7 +837,7 @@ class Scheduler(SchedulerInterface):
         # NOTE: structured_output_request_ids maps
         # a request's (request that uses structured output)
         # request_id to its index in the batch.
-        # This will helps us determine to slice the grammar bitmask
+        # This will help us determine to slice the grammar bitmask
         # and only applies valid mask for requests that
         # uses structured decoding.
         structured_output_request_ids: dict[str, int] = {}
@@ -873,19 +895,19 @@ class Scheduler(SchedulerInterface):
             scheduled_spec_token_ids = (
                 scheduler_output.scheduled_spec_decode_tokens.get(req_id))
             if scheduled_spec_token_ids:
+                num_draft_tokens = len(scheduled_spec_token_ids)
+                num_accepted = len(generated_token_ids) - 1
+                num_rejected = num_draft_tokens - num_accepted
                 # num_computed_tokens represents the number of tokens
                 # processed in the current step, considering scheduled
                 # tokens and rejections. If some tokens are rejected,
                 # num_computed_tokens is decreased by the number of rejected
-                # tokens, where is given by:
-                # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids).
-                num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 -
-                                       len(generated_token_ids))
-                request.num_computed_tokens -= num_tokens_rejected
+                # tokens.
+                request.num_computed_tokens -= num_rejected
                 spec_decoding_stats = self.make_spec_decoding_stats(
                     spec_decoding_stats,
-                    num_draft_tokens=len(scheduled_spec_token_ids),
-                    num_accepted_tokens=len(generated_token_ids) - 1)
+                    num_draft_tokens=num_draft_tokens,
+                    num_accepted_tokens=num_accepted)
 
             stopped = False
             new_logprobs = None
@@ -923,7 +945,7 @@ class Scheduler(SchedulerInterface):
                     request):
                 # NOTE: structured_output_request
                 # should not be None if use_structured_output, we have
-                # check above, so safe to ignore type warning
+                # checked above, so safe to ignore type warning
                 request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
                     req_id, new_token_ids)
 
@@ -947,9 +969,9 @@ class Scheduler(SchedulerInterface):
                         stop_reason=request.stop_reason,
                         events=request.take_events(),
                         kv_transfer_params=kv_transfer_params,
+                        trace_headers=request.trace_headers,
                         num_cached_tokens=request.num_cached_tokens,
                     ))
-
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
                 assert not prompt_logprobs_tensors
@@ -1029,7 +1051,13 @@ class Scheduler(SchedulerInterface):
             mm_positions = request.mm_positions[input_id]
             start_pos = mm_positions.offset
             num_tokens = mm_positions.length
-            if start_pos + num_tokens <= request.num_computed_tokens:
+            if self.is_encoder_decoder and request.num_computed_tokens > 0:
+                # With Whisper, as soon as we've generated a single token,
+                # we know we're done with the encoder input. Cross Attention
+                # KVs have been calculated and cached already.
+                self.encoder_cache_manager.free_encoder_input(
+                    request, input_id)
+            elif start_pos + num_tokens <= request.num_computed_tokens:
                 # The encoder output is already processed and stored
                 # in the decoder's KV cache.
                 self.encoder_cache_manager.free_encoder_input(
@@ -1178,6 +1206,8 @@ class Scheduler(SchedulerInterface):
     def shutdown(self) -> None:
         if self.kv_event_publisher:
             self.kv_event_publisher.shutdown()
+        if self.connector is not None:
+            self.connector.shutdown()
 
     ########################################################################
     # KV Connector Related Methods
@@ -1242,7 +1272,7 @@ class Scheduler(SchedulerInterface):
         finished_sending reqs to the output.
         * if finished_sending: free the blocks
         # if finished_recving: add to state so we can
-            scheduler the request during the next step.
+            schedule the request during the next step.
         """
 
         if self.connector is not None:
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index f6affb3dab66ff7085c894092a06b06955b1d96f..d27239164b0dbf5d54f771200358d0550f700dff 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -25,6 +25,7 @@ class SingleTypeKVCacheManager(ABC):
         kv_cache_spec: KVCacheSpec,
         block_pool: BlockPool,
         kv_cache_group_id: int,
+        dcp_world_size: int = 1,
     ) -> None:
         """
         Initializes the SingleTypeKVCacheManager.
@@ -33,8 +34,10 @@ class SingleTypeKVCacheManager(ABC):
             block_pool: The block pool.
             kv_cache_group_id: The id of the kv cache group of this manager.
         """
-
         self.block_size = kv_cache_spec.block_size
+        self.dcp_world_size = dcp_world_size
+        if self.dcp_world_size > 1:
+            self.block_size *= dcp_world_size
         self.kv_cache_spec = kv_cache_spec
         self.block_pool = block_pool
 
@@ -196,6 +199,7 @@ class SingleTypeKVCacheManager(ABC):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         """
         Get the longest cache hit prefix of the blocks that is not longer than 
@@ -253,6 +257,7 @@ class FullAttentionManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(
             kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec)
@@ -260,7 +265,10 @@ class FullAttentionManager(SingleTypeKVCacheManager):
             "and chunked local attention groups"
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
             [] for _ in range(len(kv_cache_group_ids)))
-        max_num_blocks = max_length // kv_cache_spec.block_size
+        block_size = kv_cache_spec.block_size
+        if dcp_world_size > 1:
+            block_size *= dcp_world_size
+        max_num_blocks = max_length // block_size
         for block_hash in itertools.islice(block_hashes, max_num_blocks):
             # block_hashes is a chain of block hashes. If a block hash is not
             # in the cached_block_hash_to_id, the following block hashes are
@@ -310,9 +318,11 @@ class SlidingWindowManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(kv_cache_spec, SlidingWindowSpec), (
             "SlidingWindowManager can only be used for sliding window groups")
+        assert dcp_world_size == 1, "DCP not support sliding window attn now."
 
         # The number of contiguous blocks needed for prefix cache hit.
         # -1 since the input token itself is also included in the window
@@ -408,6 +418,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         """
         For chunked local attention, we need to find the longest cache hit
@@ -445,6 +456,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager):
             "chunked local attention groups")
         assert use_eagle is False, ("Hybrid KV cache is not supported for " +
                                     "eagle + chunked local attention.")
+        assert dcp_world_size == 1, "DCP not support chunked local attn now."
         max_num_blocks = max_length // kv_cache_spec.block_size
         if max_length > 0:
             local_attention_start_idx = (max_length //
@@ -525,10 +537,12 @@ class MambaManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(
             kv_cache_spec,
             MambaSpec), ("MambaManager can only be used for mamba groups")
+        assert dcp_world_size == 1, "DCP not support mamba now."
         # Prefix caching is not supported for mamba now. Always return empty
         # list.
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
@@ -545,12 +559,48 @@ class MambaManager(SingleTypeKVCacheManager):
                                      num_running_requests: int) -> int:
         return 0
 
+    def get_num_blocks_to_allocate(
+            self, request_id: str, num_tokens: int,
+            new_computed_blocks: list[KVCacheBlock]) -> int:
+        """
+        Get the number of blocks needed to be allocated for the request.
+
+        Args:
+            request_id: The request ID.
+            num_tokens: The total number of tokens that need a slot (including
+                tokens that are already allocated).
+            new_computed_blocks: The new computed blocks just hitting the
+                prefix caching.
+
+        Returns:
+            The number of blocks
+        """
+
+        assert isinstance(self.kv_cache_spec, MambaSpec)
+        if self.kv_cache_spec.num_speculative_blocks > 0:
+            num_tokens += (self.kv_cache_spec.block_size *
+                           self.kv_cache_spec.num_speculative_blocks)
+        num_required_blocks = cdiv(num_tokens, self.block_size)
+        num_new_blocks = (num_required_blocks - len(new_computed_blocks) -
+                          len(self.req_to_blocks[request_id]))
+        # If a computed block of a request is an eviction candidate (in the
+        # free queue and ref_cnt == 0), it will be changed from a free block
+        # to a computed block when the request is allocated, so we also count
+        # it as needed to be allocated.
+        num_evictable_computed_blocks = sum(
+            blk.ref_cnt == 0 and not blk.is_null
+            for blk in new_computed_blocks)
+        return num_new_blocks + num_evictable_computed_blocks
+
     def allocate_new_blocks(self, request_id: str,
                             num_tokens: int) -> list[KVCacheBlock]:
-        new_blocks = super().allocate_new_blocks(request_id, num_tokens)
-        assert len(self.req_to_blocks[request_id]) == 1, (
-            "MambaManager should only allocate 1 block for each request.")
-        return new_blocks
+        # Allocate extra `num_speculative_blocks` blocks for
+        # speculative decoding (MTP/EAGLE) with linear attention.
+        assert isinstance(self.kv_cache_spec, MambaSpec)
+        if self.kv_cache_spec.num_speculative_blocks > 0:
+            num_tokens += (self.kv_cache_spec.block_size *
+                           self.kv_cache_spec.num_speculative_blocks)
+        return super().allocate_new_blocks(request_id, num_tokens)
 
 
 class CrossAttentionManager(SingleTypeKVCacheManager):
@@ -583,6 +633,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager):
         block_pool: BlockPool,
         kv_cache_spec: KVCacheSpec,
         use_eagle: bool,
+        dcp_world_size: int = 1,
     ) -> tuple[list[KVCacheBlock], ...]:
         assert isinstance(kv_cache_spec, CrossAttentionSpec), (
             "CrossAttentionManager can only be used for cross-attention groups"
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 5d8959a3cd3feb4565b28b4f6eab0f1bde0f0f08..dec4abec519bd87c3b5b80eaef1509bbcde50ac1 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -3,6 +3,7 @@
 
 import enum
 import time
+from collections.abc import Mapping
 from typing import Any, Optional, Union
 
 import msgspec
@@ -66,6 +67,8 @@ class EngineCoreRequest(
     current_wave: int = 0
     priority: int = 0
 
+    trace_headers: Optional[Mapping[str, str]] = None
+
 
 class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
@@ -111,6 +114,7 @@ class EngineCoreOutput(
     events: Optional[list[EngineCoreEvent]] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
 
+    trace_headers: Optional[Mapping[str, str]] = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
@@ -144,7 +148,7 @@ class EngineCoreOutputs(
         omit_defaults=True,  # type: ignore[call-arg]
         gc=False):  # type: ignore[call-arg]
 
-    #NOTE(Nick): We could consider ways to make this more compact,
+    # NOTE(Nick): We could consider ways to make this more compact,
     # e.g. columnwise layout
 
     engine_index: int = 0
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 2a9fa1fd9172c023ac824277199d55ab4965187f..e467f40f6eab1dee965a8c6da43d5ae633c46736 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -26,6 +26,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -97,8 +98,14 @@ class AsyncLLM(EngineClient):
 
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.log_requests = log_requests
-        self.log_stats = log_stats
+
+        self.log_stats = log_stats or (stat_loggers is not None)
+        if not log_stats and stat_loggers is not None:
+            logger.info(
+                "AsyncLLM created with log_stats=False and non-empty custom "
+                "logger list; enabling logging without default stat loggers")
 
         if self.model_config.skip_tokenizer_init:
             self.tokenizer = None
@@ -119,6 +126,11 @@ class AsyncLLM(EngineClient):
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_async_mp_client(
@@ -137,6 +149,8 @@ class AsyncLLM(EngineClient):
                 vllm_config=vllm_config,
                 engine_idxs=self.engine_core.engine_ranks_managed,
                 custom_stat_loggers=stat_loggers,
+                enable_default_loggers=log_stats,
+                client_count=client_count,
             )
             self.logger_manager.log_engine_initialized()
 
@@ -596,7 +610,7 @@ class AsyncLLM(EngineClient):
         return self.tokenizer.get_lora_tokenizer(lora_request)
 
     async def is_tracing_enabled(self) -> bool:
-        return False
+        return self.observability_config.otlp_traces_endpoint is not None
 
     async def do_log_stats(
         self,
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 922c06b44be8847bb1bda545fc48b9642b608a1f..b46ae72ccdf1b6e5f1c7564a44061bc079712c33 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
 import os
 import queue
 import signal
@@ -223,7 +224,7 @@ class EngineCore:
 
     def add_request(self, request: Request, request_wave: int = 0):
         """Add request to the scheduler.
-        
+
         `request_wave`: indicate which wave of requests this is expected to
         belong to in DP case
         """
@@ -432,13 +433,13 @@ class EngineCore:
     def preprocess_add_request(
             self, request: EngineCoreRequest) -> tuple[Request, int]:
         """Preprocess the request.
-        
+
         This function could be directly used in input processing thread to allow
         request initialization running in parallel with Model forward
         """
         # Note on thread safety: no race condition.
         # `mm_receiver_cache` is reset at the end of LLMEngine init,
-        # and will only accessed in the input processing thread afterwards.
+        # and will only be accessed in the input processing thread afterwards.
         if self.mm_receiver_cache is not None and request.mm_features:
             request.mm_features = (
                 self.mm_receiver_cache.get_and_update_features(
@@ -536,6 +537,11 @@ class EngineCoreProc(EngineCore):
         self.step_fn = (self.step if self.batch_queue is None else
                         self.step_with_batch_queue)
 
+        # Mark the startup heap as static so that it's ignored by GC.
+        # Reduces pause times of oldest generation collections.
+        gc.collect()
+        gc.freeze()
+
     @contextmanager
     def _perform_handshakes(
         self,
@@ -691,7 +697,7 @@ class EngineCoreProc(EngineCore):
             parallel_config: ParallelConfig = kwargs[
                 "vllm_config"].parallel_config
             if parallel_config.data_parallel_size > 1 or dp_rank > 0:
-                set_process_title("DPEngineCore", str(dp_rank))
+                set_process_title("EngineCore", f"DP{dp_rank}")
                 decorate_logs()
                 # Set data parallel rank for this engine process.
                 parallel_config.data_parallel_rank = dp_rank
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index b8db0d99bdc73f6900fb210c441053fe6ac3132e..3c7f67cf0e2ff620f86f4c13d4a77653a2e05904 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -348,8 +348,9 @@ class BackgroundResources:
 
         if isinstance(self.output_socket, zmq.asyncio.Socket):
             # Async case.
-            loop = self.output_socket._get_loop()
-            asyncio.get_running_loop()
+            loop = self.output_queue_task._loop \
+                if self.output_queue_task else None
+
             sockets = (self.output_socket, self.input_socket,
                        self.first_req_send_socket, self.first_req_rcv_socket,
                        self.stats_update_socket)
@@ -360,11 +361,12 @@ class BackgroundResources:
                 close_sockets(sockets)
                 for task in tasks:
                     if task is not None and not task.done():
-                        task.cancel()
+                        with contextlib.suppress(Exception):
+                            task.cancel()
 
             if in_loop(loop):
                 close_sockets_and_tasks()
-            elif not loop.is_closed():
+            elif loop and not loop.is_closed():
                 loop.call_soon_threadsafe(close_sockets_and_tasks)
             else:
                 # Loop has been closed, try to clean up directly.
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 04ad51aae0a8c3b8c75872a1371277605a4f4873..cf4b06db843bd315db17fa955b5ce702e743277a 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -121,12 +121,9 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
                     self.output_token_ids) <= self.min_tokens:
                 stop_check_offset = len(self.output_text)
 
-        if stop_terminated:
-            if skipped_stop_token_id is not None:
-                # Cleanup after skipping detokenization.
-                self.token_ids.append(skipped_stop_token_id)
-            # Stop token triggered; skip stop string check.
-            return None
+        if skipped_stop_token_id is not None:
+            # Cleanup after skipping detokenization.
+            self.token_ids.append(skipped_stop_token_id)
 
         # 2) Evaluate stop strings.
         stop_string = None
@@ -233,8 +230,13 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
     def _protected_step(self, next_token_id: int) -> Optional[str]:
         try:
             token = self.stream.step(self.tokenizer, next_token_id)
+        except OverflowError:
+            # Handle rare observed overflow, still to be diagnosed.
+            # See https://github.com/vllm-project/vllm/issues/21951.
+            logger.exception("Encountered invalid token id: %d", next_token_id)
+            token = None
         except Exception as e:
-            if str(e) != INVALID_PREFIX_ERR_MSG:
+            if not str(e).startswith(INVALID_PREFIX_ERR_MSG):
                 raise e
             # Recover from edge case where tokenizer can produce non-monotonic,
             # invalid UTF-8 output, which breaks the internal state of
@@ -243,7 +245,8 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
             logger.warning(
                 "Encountered invalid prefix detokenization error"
                 " for request %s, resetting decode stream.", self.request_id)
-            self.stream = DecodeStream(self.skip_special_tokens)
+            self.stream = DecodeStream(
+                skip_special_tokens=self.skip_special_tokens)
             token = self.stream.step(self.tokenizer, next_token_id)
         return token
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 7130f666ef19fb5509aa7e41eafc38b147b49d3d..fca5a783bc3bf6a0b75bac082f8800698614852e 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -19,6 +19,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
@@ -65,6 +66,7 @@ class LLMEngine:
                 "Set VLLM_USE_V1=0 and file and issue on Github.")
 
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
@@ -99,6 +101,11 @@ class LLMEngine:
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index 3de7fa6889e5599640c8b580cc1becca03847adc..133122b6fcc0ce932cbdef3eb2eaafc24846a20f 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from typing import Optional
 
 from vllm.logger import init_logger
-from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_ids_list_to_tokens)
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 2ee55b585da6c81fc6186bc4eab72622609b476d..02c8c61cb9093370e93eaa34c10487c4ac9c9be4 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -11,6 +11,8 @@ import torch
 from vllm.outputs import (CompletionOutput, PoolingOutput,
                           PoolingRequestOutput, RequestOutput)
 from vllm.sampling_params import RequestOutputKind
+from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
+                          extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
@@ -71,7 +73,6 @@ class RequestOutputCollector:
 
 @dataclass
 class OutputProcessorOutput:
-
     request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
     reqs_to_abort: list[str]
 
@@ -93,6 +94,9 @@ class RequestState:
         arrival_time: float,
         queue: Optional[RequestOutputCollector],
         log_stats: bool,
+        top_p: Optional[float] = None,
+        n: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         self.request_id = request_id
         self.parent_req = parent_req
@@ -105,6 +109,9 @@ class RequestState:
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
+        self.top_p = top_p
+        self.n = n
+        self.temperature = temperature
         self.is_prefilling = True
         self.queue = queue
         self.num_cached_tokens = 0
@@ -137,10 +144,16 @@ class RequestState:
                 request=request,
             )
             max_tokens_param = sampling_params.max_tokens
+            top_p = sampling_params.top_p
+            n = sampling_params.n
+            temperature = sampling_params.temperature
         else:
             logprobs_processor = None
             detokenizer = None
             max_tokens_param = None
+            top_p = None
+            n = None
+            temperature = None
             assert request.pooling_params is not None
             output_kind = request.pooling_params.output_kind
 
@@ -156,6 +169,9 @@ class RequestState:
             logprobs_processor=logprobs_processor,
             detokenizer=detokenizer,
             max_tokens_param=max_tokens_param,
+            top_p=top_p,
+            n=n,
+            temperature=temperature,
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
@@ -274,16 +290,13 @@ class RequestState:
 class OutputProcessor:
     """Process EngineCoreOutputs into RequestOutputs."""
 
-    def __init__(
-        self,
-        tokenizer: TokenizerGroup,
-        log_stats: bool,
-    ):
+    def __init__(self, tokenizer: TokenizerGroup, log_stats: bool):
         self.log_stats = log_stats
         self.tokenizer = tokenizer
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
+        self.tracer: Optional[Tracer] = None
 
     def get_num_unfinished_requests(self):
         return len(self.request_states)
@@ -441,7 +454,9 @@ class OutputProcessor:
                 # Track per-request stats
                 self._update_stats_from_finished(req_state, finish_reason,
                                                  iteration_stats)
-
+                if self.tracer:
+                    self.do_tracing(engine_core_output, req_state,
+                                    iteration_stats)
         self.lora_states.update_iteration_stats(iteration_stats)
 
         return OutputProcessorOutput(
@@ -449,6 +464,63 @@ class OutputProcessor:
             reqs_to_abort=reqs_to_abort,
         )
 
+    def do_tracing(self, engine_core_output: EngineCoreOutput,
+                   req_state: RequestState,
+                   iteration_stats: Optional[IterationStats]) -> None:
+        assert req_state.stats is not None
+        assert iteration_stats is not None
+        assert self.tracer is not None
+
+        arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
+        trace_context = extract_trace_context(engine_core_output.trace_headers)
+        with (self.tracer.start_as_current_span(
+                "llm_request",
+                kind=SpanKind.SERVER,
+                context=trace_context,
+                start_time=arrival_time_nano_seconds) as span):
+            metrics = req_state.stats
+            e2e_time = iteration_stats.iteration_timestamp - \
+                       metrics.arrival_time
+            queued_time = metrics.scheduled_ts - metrics.queued_ts
+            prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+            decode_time = metrics.last_token_ts - metrics.first_token_ts
+            inference_time = metrics.last_token_ts - metrics.scheduled_ts
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                metrics.first_token_latency)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time)
+            span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                               queued_time)
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                               len(req_state.prompt_token_ids))
+            span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                               metrics.num_generation_tokens)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL,
+                prefill_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE,
+                decode_time)
+            span.set_attribute(
+                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE,
+                inference_time)
+
+            # meta
+            span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
+                               req_state.request_id)
+            if req_state.top_p:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
+                                   req_state.top_p)
+            if req_state.max_tokens_param:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                                   req_state.max_tokens_param)
+            if req_state.temperature:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+                                   req_state.temperature)
+            if req_state.n:
+                span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
+                                   req_state.n)
+
     def _update_stats_from_output(self, req_state: RequestState,
                                   engine_core_output: EngineCoreOutput,
                                   engine_core_timestamp: Optional[float],
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 1aa117ded4ed8ea787d4b927ec4f34e6ca3825e0..a54a98adf56b5bd1de662a56375ff99788c21142 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -12,7 +12,7 @@ from vllm.inputs.preprocess import InputPreprocessor
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import processor_cache_from_config
-from vllm.multimodal.inputs import MultiModalFeatureSpec
+from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict
 from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
@@ -65,19 +65,27 @@ class Processor:
     ) -> None:
         max_logprobs = self.model_config.max_logprobs
         if max_logprobs == -1:
-            return
+            max_logprobs = self.model_config.get_vocab_size()
+
         # Validate sample logprobs.
-        if params.logprobs and (params.logprobs == -1
-                                or params.logprobs > max_logprobs):
-            raise ValueError(
-                f"Requested sample logprobs of {params.logprobs}, "
-                f"which is greater than max allowed: {max_logprobs}")
+        if params.logprobs:
+            num_logprobs = params.logprobs
+            if num_logprobs == -1:
+                num_logprobs = self.model_config.get_vocab_size()
+            if num_logprobs > max_logprobs:
+                raise ValueError(
+                    f"Requested sample logprobs of {num_logprobs}, "
+                    f"which is is greater than max allowed: {max_logprobs}")
 
         # Validate prompt logprobs.
-        if params.prompt_logprobs and params.prompt_logprobs > max_logprobs:
-            raise ValueError(
-                f"Requested prompt logprobs of {params.prompt_logprobs}, "
-                f"which is greater than max allowed: {max_logprobs}")
+        if params.prompt_logprobs:
+            num_prompt_logprobs = params.prompt_logprobs
+            if num_prompt_logprobs == -1:
+                num_prompt_logprobs = self.model_config.get_vocab_size()
+            if num_prompt_logprobs > max_logprobs:
+                raise ValueError(
+                    f"Requested prompt logprobs of {num_prompt_logprobs}, "
+                    f"which is is greater than max allowed: {max_logprobs}")
 
     def _validate_sampling_params(
         self,
@@ -268,11 +276,11 @@ class Processor:
             # Remember that this backend was set automatically
             params.guided_decoding.backend_was_auto = True
 
-    def _maybe_build_mm_hash_overrides(
+    def _maybe_build_mm_uuids(
         self,
         request_id: str,
         prompt: PromptType,
-    ) -> Optional[dict[str, list[str]]]:
+    ) -> Optional[MultiModalUUIDDict]:
         """Build per-item multimodal hash overrides when enabled. In this case,
         multimodal data items are identified by their request id, modality and
         index rather than their content.
@@ -295,13 +303,13 @@ class Processor:
         if not mm_data:
             return None
 
-        overrides: dict[str, list[str]] = {}
+        mm_uuids: MultiModalUUIDDict = {}
         for modality, data in mm_data.items():
             n = len(data) if isinstance(data, list) else 1
-            overrides[modality] = [
+            mm_uuids[modality] = [
                 f"{request_id}-{modality}-{i}" for i in range(n)
             ]
-        return overrides
+        return mm_uuids
 
     def process_inputs(
         self,
@@ -317,11 +325,8 @@ class Processor:
     ) -> tuple[Optional[str], EngineCoreRequest]:
 
         # TODO(woosuk): Support pooling models.
-        # TODO(woosuk): Support encoder-decoder models.
         self._validate_lora(lora_request)
         self._validate_params(params, lora_request)
-        if trace_headers is not None:
-            raise ValueError("V1 does not support tracing yet.")
 
         data_parallel_size = self.vllm_config.parallel_config.data_parallel_size
         if data_parallel_rank is not None and not (0 <= data_parallel_rank <
@@ -343,16 +348,15 @@ class Processor:
         if (self.model_config.multimodal_config and
                 self.model_config.multimodal_config.mm_processor_cache_gb == 0
                 and not self.cache_config.enable_prefix_caching):
-            mm_hash_overrides = self._maybe_build_mm_hash_overrides(
-                request_id, prompt)
+            mm_uuids = self._maybe_build_mm_uuids(request_id, prompt)
         else:
             # Otherwise, use user-provided uuids as multimodal hash overrides
             # if provided.
             self._validate_multi_modal_uuids(prompt)
             if isinstance(prompt, dict):
-                mm_hash_overrides = prompt.get("multi_modal_uuids")
+                mm_uuids = prompt.get("multi_modal_uuids")
             else:
-                mm_hash_overrides = None
+                mm_uuids = None
 
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
@@ -362,7 +366,7 @@ class Processor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             lora_request=lora_request,
-            mm_hash_overrides=mm_hash_overrides,
+            mm_uuids=mm_uuids,
         )
         from vllm.platforms import current_platform
         current_platform.validate_request(
@@ -377,10 +381,6 @@ class Processor:
 
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
 
-        # TODO: Impl encoder-decoder
-        if encoder_inputs is not None:
-            raise NotImplementedError
-
         sampling_params = None
         pooling_params = None
         if isinstance(params, SamplingParams):
@@ -433,6 +433,7 @@ class Processor:
             cache_salt=decoder_inputs.get("cache_salt"),
             priority=priority,
             data_parallel_rank=data_parallel_rank,
+            trace_headers=trace_headers,
         )
 
     def _validate_model_inputs(self,
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 56ef8477d267ac03a1ed6ffec239448653564b1c..df2fd8d9df078dcbf18d19c763a2d55ad22794a1 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -116,7 +116,7 @@ class CoreEngineProcManager:
             local_dp_ranks.append(local_index)
             self.processes.append(
                 context.Process(target=target_fn,
-                                name=f"EngineCore_{global_index}",
+                                name=f"EngineCore_DP{global_index}",
                                 kwargs=common_kwargs | {
                                     "dp_rank": global_index,
                                     "local_dp_rank": local_index,
@@ -315,7 +315,6 @@ class CoreEngineActorManager:
 
         import ray
         from ray._private.state import available_resources_per_node
-        from ray.util.state import list_nodes
 
         logger.info("Creating placement groups for data parallel")
         dp_master_ip = \
@@ -324,31 +323,28 @@ class CoreEngineActorManager:
         local_engine_count = \
             vllm_config.parallel_config.data_parallel_size_local
 
-        nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]),
-                       key=lambda node: node.node_ip != dp_master_ip)
-        assert nodes[0].node_ip == dp_master_ip, (
-            "The head node is missing or dead")
-        assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
-            "There can only be one head node")
-
         available_resources = available_resources_per_node()
         world_size = vllm_config.parallel_config.world_size
         placement_groups: list[PlacementGroup] = []
         local_dp_ranks: list[int] = []
-
-        for node in nodes:
-            node_ip = node.node_ip
-            node_resources = available_resources[node.node_id]
+        dp_master_ip_key = f'node:{dp_master_ip}'
+        nodes = sorted(available_resources.values(),
+                       key=lambda x: dp_master_ip_key not in x)
+        assert len(nodes) > 0, (
+            "No nodes with resources found in Ray cluster.")
+        assert dp_master_ip_key in nodes[0], (
+            "The DP master node (ip: %s) is missing or dead", dp_master_ip)
+        for node_resources in nodes:
             if "GPU" not in node_resources:
                 continue
             # For now, each DP rank can only be assigned to one node
             # TODO(rui): support allocating a single DP rank
             # to multiple nodes
             available_engine_count = int(node_resources["GPU"]) // world_size
-            if node_ip == dp_master_ip:
+            if dp_master_ip_key in node_resources:
                 assert available_engine_count >= local_engine_count, (
                     "Not enough resources to allocate DP ranks "
-                    f"on DP master node {node_ip}")
+                    f"on DP master node {dp_master_ip}")
                 for i in range(local_engine_count):
                     bundles = [{
                         "GPU": 1.0,
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 12e79ff165f4e5ac56572acf4fae09a0621c3df2..bcf6dda9c1e9168047307fee70f99463d96f96ed 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
-import os
 import pickle
+import queue
 import signal
 import threading
 import time
@@ -26,6 +26,8 @@ from vllm.distributed import (destroy_distributed_environment,
 from vllm.distributed.device_communicators.shm_broadcast import (Handle,
                                                                  MessageQueue)
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
+from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
+                                             get_pp_group, get_tp_group)
 from vllm.executor.multiproc_worker_utils import (
     set_multiprocessing_worker_envs)
 from vllm.logger import init_logger
@@ -33,7 +35,8 @@ from vllm.utils import (decorate_logs, get_distributed_init_method,
                         get_loopback_ip, get_mp_context, get_open_port,
                         set_process_title)
 from vllm.v1.executor.abstract import Executor, FailureCallback
-from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
+from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
+                             ModelRunnerOutput)
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -253,7 +256,8 @@ class MultiprocExecutor(Executor):
                     if not non_block:
                         result = result.result()
                 elif not non_block:
-                    result = get_response(w, dequeue_timeout)
+                    result = get_response(w, dequeue_timeout,
+                                          self.shutdown_event)
                 else:
                     raise RuntimeError("non_block can only be used when"
                                        " max_concurrent_batches > 1")
@@ -295,12 +299,8 @@ class MultiprocExecutor(Executor):
         """Properly shut down the executor and its workers"""
         if not getattr(self, 'shutting_down', False):
             self.shutting_down = True
-            self.shutdown_event.set()
-
-            if self.io_thread_pool is not None:
-                self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
-                self.io_thread_pool = None
 
+            # Make sure all the worker processes are terminated first.
             if workers := getattr(self, 'workers', None):
                 for w in workers:
                     # Close death_writer to signal child processes to exit
@@ -310,6 +310,11 @@ class MultiprocExecutor(Executor):
                     w.worker_response_mq = None
                 self._ensure_worker_termination([w.proc for w in workers])
 
+            self.shutdown_event.set()
+            if self.io_thread_pool is not None:
+                self.io_thread_pool.shutdown(wait=False, cancel_futures=True)
+                del self.io_thread_pool
+
         self.rpc_broadcast_mq = None
 
     def check_health(self) -> None:
@@ -394,17 +399,6 @@ class WorkerProc:
         wrapper.init_worker(all_kwargs)
         self.worker = wrapper
 
-        pp_size = vllm_config.parallel_config.pipeline_parallel_size
-        tp_size = vllm_config.parallel_config.tensor_parallel_size
-        pp_str = f"PP{rank // tp_size}" if pp_size > 1 else ""
-        tp_str = f"TP{rank % tp_size}" if tp_size > 1 else ""
-        suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}"
-        process_name = "VllmWorker"
-        if suffix:
-            set_process_title(suffix, append=True)
-            process_name = f"{process_name} {suffix}"
-        decorate_logs(process_name)
-
         # Initialize MessageQueue for receiving SchedulerOutput
         self.rpc_broadcast_mq = MessageQueue.create_from_handle(
             input_shm_handle, self.worker.rank)
@@ -412,8 +406,24 @@ class WorkerProc:
         # Initializes a message queue for sending the model output
         self.worker_response_mq = MessageQueue(1, 1)
 
-        # Initialize device and loads weights
+        scheduler_config = vllm_config.scheduler_config
+        self.use_async_scheduling = scheduler_config.async_scheduling
+        if self.use_async_scheduling:
+            self.async_output_queue: queue.Queue = queue.Queue()
+            self.async_output_copy_thread = Thread(
+                target=self.async_output_busy_loop,
+                daemon=True,
+                name="WorkerAsyncOutputCopy")
+            self.async_output_copy_thread.start()
+
+        # Initialize device
         self.worker.init_device()
+
+        # Set process title and log prefix
+        self.setup_proc_title_and_log_prefix(
+            enable_ep=vllm_config.parallel_config.enable_expert_parallel)
+
+        # Load model
         self.worker.load_model()
 
     @staticmethod
@@ -493,6 +503,7 @@ class WorkerProc:
         return cast(list[WorkerProcHandle], ready_proc_handles)
 
     def shutdown(self):
+        self.worker.shutdown()
         self.rpc_broadcast_mq = None
         self.worker_response_mq = None
         destroy_model_parallel()
@@ -522,7 +533,7 @@ class WorkerProc:
         # tuple[Connection, Connection]
         reader, ready_writer = kwargs.pop("ready_pipe")
         death_pipe = kwargs.pop("death_pipe", None)
-
+        shutdown_event = threading.Event()
         # Start death monitoring thread if death_pipe is provided
         if death_pipe is not None:
 
@@ -534,7 +545,7 @@ class WorkerProc:
                     # Parent process has exited, terminate this worker
                     logger.info("Parent process exited, terminating worker")
                     # Send signal to self to trigger clean shutdown
-                    os.kill(os.getpid(), signal.SIGTERM)
+                    shutdown_event.set()
                 except Exception as e:
                     logger.warning("Death monitoring error: %s", e)
 
@@ -562,7 +573,7 @@ class WorkerProc:
             ready_writer.close()
             ready_writer = None
 
-            worker.worker_busy_loop()
+            worker.worker_busy_loop(cancel=shutdown_event)
 
         except Exception:
             # NOTE: if an Exception arises in busy_loop, we send
@@ -572,6 +583,8 @@ class WorkerProc:
 
             if ready_writer is not None:
                 logger.exception("WorkerProc failed to start.")
+            elif shutdown_event.is_set():
+                logger.info("WorkerProc shutting down.")
             else:
                 logger.exception("WorkerProc failed.")
 
@@ -593,11 +606,41 @@ class WorkerProc:
         SUCCESS = auto()
         FAILURE = auto()
 
-    def worker_busy_loop(self):
-        """Main busy loop for Multiprocessing Workers"""
+    def enqueue_output(self, output: Any):
+        """Prepares output from the worker and enqueues it to the
+        worker_response_mq. If the output is an Exception, it is
+        converted to a FAILURE response.
+        """
+        if isinstance(output, AsyncModelRunnerOutput):
+            output = output.get_output()
+
+        if isinstance(output, Exception):
+            result = (WorkerProc.ResponseStatus.FAILURE, str(output))
+        else:
+            result = (WorkerProc.ResponseStatus.SUCCESS, output)
+        self.worker_response_mq.enqueue(result)
+
+    def handle_output(self, output: Any):
+        """Handles output from the worker. If async scheduling is enabled,
+        it is passed to the async_output_busy_loop thread. Otherwise, it is
+        enqueued directly to the worker_response_mq.
+        """
+        if self.use_async_scheduling:
+            self.async_output_queue.put(output)
+        else:
+            self.enqueue_output(output)
+
+    def async_output_busy_loop(self):
+        """Entrypoint for the thread which handles outputs asynchronously."""
         while True:
-            method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue()
+            output = self.async_output_queue.get()
+            self.enqueue_output(output)
 
+    def worker_busy_loop(self, cancel: Optional[threading.Event] = None):
+        """Main busy loop for Multiprocessing Workers"""
+        while True:
+            method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
+                cancel=cancel)
             try:
                 if isinstance(method, str):
                     func = getattr(self.worker, method)
@@ -612,10 +655,29 @@ class WorkerProc:
                 # exception might not be serializable, so we convert it to
                 # string, only for logging purpose.
                 if output_rank is None or self.rank == output_rank:
-                    self.worker_response_mq.enqueue(
-                        (WorkerProc.ResponseStatus.FAILURE, str(e)))
+                    self.handle_output(e)
                 continue
 
             if output_rank is None or self.rank == output_rank:
-                self.worker_response_mq.enqueue(
-                    (WorkerProc.ResponseStatus.SUCCESS, output))
+                self.handle_output(output)
+
+    @staticmethod
+    def setup_proc_title_and_log_prefix(enable_ep: bool) -> None:
+        dp_size = get_dp_group().world_size
+        dp_rank = get_dp_group().rank_in_group
+        pp_size = get_pp_group().world_size
+        pp_rank = get_pp_group().rank_in_group
+        tp_size = get_tp_group().world_size
+        tp_rank = get_tp_group().rank_in_group
+        process_name = "Worker"
+        if dp_size > 1:
+            process_name += f"_DP{dp_rank}"
+        if pp_size > 1:
+            process_name += f"_PP{pp_rank}"
+        if tp_size > 1:
+            process_name += f"_TP{tp_rank}"
+        if enable_ep:
+            ep_rank = get_ep_group().rank_in_group
+            process_name += f"_EP{ep_rank}"
+        set_process_title(name=process_name)
+        decorate_logs(process_name)
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index a3e4d393e4d2054ee4768b40747abc990cb774f7..6e8f569fff0e3aca62b652e861447febce2e53bb 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -86,6 +86,12 @@ class FullAttentionSpec(AttentionSpec):
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
         max_model_len = vllm_config.model_config.max_model_len
+        dcp_world_size = \
+            vllm_config.parallel_config.decode_context_parallel_size
+        # Note(hc): each dcp rank only need save
+        # (max_model_len//dcp_world_size) tokens locally.
+        if dcp_world_size > 1:
+            max_model_len = cdiv(max_model_len, dcp_world_size)
         return cdiv(max_model_len, self.block_size) * self.page_size_bytes
 
     @classmethod
@@ -162,6 +168,8 @@ class SlidingWindowSpec(AttentionSpec):
         assert not self.use_mla, "MLA is not supported for sliding window"
 
     def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
+        assert vllm_config.parallel_config.decode_context_parallel_size == 1, \
+            "DCP not support sliding window."
         max_model_len = vllm_config.model_config.max_model_len
         max_num_batched_tokens = (
             vllm_config.scheduler_config.max_num_batched_tokens)
@@ -186,6 +194,7 @@ class MambaSpec(KVCacheSpec):
     dtypes: tuple[torch.dtype]
     page_size_padded: Optional[int] = None
     mamba_type: str = "mamba2"
+    num_speculative_blocks: int = 0
 
     @property
     def page_size_bytes(self) -> int:
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3b0616952babf65408c4325d3153c0388d6d6535..347185d8341ee2243c3a4f9fffd6e9972792d7ce 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_time_to_first_token = make_per_engine(
             histogram_time_to_first_token, engine_indexes, model_name)
 
+        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds."
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
@@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_time_per_output_token = make_per_engine(
             histogram_time_per_output_token, engine_indexes, model_name)
 
+        histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_inter_token_latency = make_per_engine(
+            histogram_inter_token_latency, engine_indexes, model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
             self.histogram_n_request[engine_idx].observe(n_param)
         for ttft in iteration_stats.time_to_first_tokens_iter:
             self.histogram_time_to_first_token[engine_idx].observe(ttft)
-        for tpot in iteration_stats.time_per_output_tokens_iter:
-            self.histogram_time_per_output_token[engine_idx].observe(tpot)
+        for itl in iteration_stats.inter_token_latencies_iter:
+            self.histogram_inter_token_latency[engine_idx].observe(itl)
+            self.histogram_time_per_output_token[engine_idx].observe(itl)
 
         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[
@@ -635,15 +651,21 @@ class StatLoggerManager:
         vllm_config: VllmConfig,
         engine_idxs: Optional[list[int]] = None,
         custom_stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        enable_default_loggers: bool = True,
+        client_count: int = 1,
     ):
         self.engine_idxs = engine_idxs if engine_idxs else [0]
 
-        factories: list[StatLoggerFactory]
+        factories: list[StatLoggerFactory] = []
         if custom_stat_loggers is not None:
-            factories = custom_stat_loggers
-        else:
-            factories = []
-            if logger.isEnabledFor(logging.INFO):
+            factories.extend(custom_stat_loggers)
+
+        if enable_default_loggers and logger.isEnabledFor(logging.INFO):
+            if client_count > 1:
+                logger.warning(
+                    "AsyncLLM created with api_server_count more than 1; "
+                    "disabling stats logging to avoid incomplete stats.")
+            else:
                 factories.append(LoggingStatLogger)
 
         # engine_idx: StatLogger
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 9a80460261e0290b20c85a34043515e494fadaef..e6c344d193df229e31174d264859ffb56582e749 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -59,7 +59,7 @@ class RequestStateStats:
 
     num_generation_tokens: int = 0
 
-    # This is a engine frontend timestamp (wall-clock)
+    # This is an engine frontend timestamp (wall-clock)
     arrival_time: float = 0.0
 
     # These are engine core timestamps (monotonic)
@@ -68,6 +68,9 @@ class RequestStateStats:
     first_token_ts: float = 0.0
     last_token_ts: float = 0.0
 
+    # first token latency
+    first_token_latency: float = 0.0
+
 
 @dataclass
 class FinishedRequestStats:
@@ -96,7 +99,7 @@ class IterationStats:
         self.max_num_generation_tokens_iter: list[int] = []
         self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
-        self.time_per_output_tokens_iter: list[float] = []
+        self.inter_token_latencies_iter: list[float] = []
         self.waiting_lora_adapters: dict[str, int] = {}
         self.running_lora_adapters: dict[str, int] = {}
 
@@ -116,6 +119,7 @@ class IterationStats:
 
             first_token_latency = self._time_since(req_stats.arrival_time)
             self.time_to_first_tokens_iter.append(first_token_latency)
+            req_stats.first_token_latency = first_token_latency
 
         req_stats.num_generation_tokens += num_new_generation_tokens
 
@@ -128,8 +132,8 @@ class IterationStats:
         if is_prefilling:
             req_stats.first_token_ts = engine_core_timestamp
         else:
-            tpot = engine_core_timestamp - req_stats.last_token_ts
-            self.time_per_output_tokens_iter.append(tpot)
+            itl = engine_core_timestamp - req_stats.last_token_ts
+            self.inter_token_latencies_iter.append(itl)
 
         req_stats.last_token_ts = engine_core_timestamp
 
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index f8d6b24702f3c15f3852cfcc9d3bbe0c8bb948c8..1b2da8addb19e40b0c73c3481619428d1bcec083 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import NamedTuple, Optional
 
@@ -114,6 +115,20 @@ class ModelRunnerOutput:
     num_nans_in_logits: Optional[dict[str, int]] = None
 
 
+# ModelRunnerOutput wrapper for async scheduling.
+class AsyncModelRunnerOutput(ABC):
+
+    @abstractmethod
+    def get_output(self) -> ModelRunnerOutput:
+        """Get the ModelRunnerOutput for this async output.
+        
+        This is a blocking call that waits until the results are ready, which
+        might involve copying device tensors to the host.
+        This method should only be called once per AsyncModelRunnerOutput.
+        """
+        pass
+
+
 @dataclass
 class DraftTokenIds:
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index ad7477241ebbd7909c17e85e304706a7c3eb367b..64cce3e9efc51a6b564a337facdebcef1279a9fe 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -3,6 +3,7 @@
 
 import enum
 import time
+from collections.abc import Mapping
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
@@ -35,6 +36,7 @@ class Request:
         structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: Optional[str] = None,
         priority: int = 0,
+        trace_headers: Optional[Mapping[str, str]] = None,
         block_hasher: Optional[Callable[["Request"],
                                         list["BlockHash"]]] = None,
     ) -> None:
@@ -100,7 +102,8 @@ class Request:
         # they should also be updated simultaneously.
         self.output_token_ids = ConstantList(self._output_token_ids)
         self.all_token_ids = ConstantList(self._all_token_ids)
-
+        # trace_headers
+        self.trace_headers = trace_headers
         # State
         # The number of tokens with prefix cache hits.
         self.num_cached_tokens = -1
@@ -136,6 +139,7 @@ class Request:
                     if request.sampling_params else None,
             cache_salt=request.cache_salt,
             priority=request.priority,
+            trace_headers=request.trace_headers,
             block_hasher=block_hasher,
         )
 
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index 822026916295179ee648811252b8639ebab11315..df944873bcaf3623b22dc3bf554b5af611b87c80 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -1,16 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
+import inspect
 import itertools
+from abc import abstractmethod
 from collections.abc import Sequence
+from functools import partial
 from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 
 from vllm.logger import init_logger
+from vllm.logits_process import LogitsProcessor as RequestLogitsProcessor
+from vllm.sampling_params import SamplingParams
 from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor,
                                                      MinPLogitsProcessor,
-                                                     MinTokensLogitsProcessor)
+                                                     MinTokensLogitsProcessor,
+                                                     process_dict_updates)
 from vllm.v1.sample.logits_processor.interface import (BatchUpdate,
                                                        LogitsProcessor,
                                                        MoveDirectionality)
@@ -177,9 +183,112 @@ def build_logitsprocs(
             BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes))
 
 
+class AdapterLogitsProcessor(LogitsProcessor):
+    """Wrapper for per-request logits processors
+    
+    To wrap a specific per-request logits processor,
+    * Subclass `AdapterLogitsProcessor`
+    * Implement `self.is_argmax_invariant()` base-class method
+    * Implement `self.new_req_logits_processor(params)`
+    
+    `self.__init__(vllm_config, device, is_pin_memory)` does not need to be
+    overridden in general. However, to implement custom constructor behavior -
+    especially any logic which operates on or stores `vllm_config`, `device`,
+    or `is_pin_memory` - `self.__init__(vllm_config, device, is_pin_memory)`
+    must be overridden and the override must call
+    `super().__init__(vllm_config, device, is_pin_memory)`
+    """
+
+    def __init__(self, vllm_config: "VllmConfig", device: torch.device,
+                 is_pin_memory: bool):
+        """Subclass must invoke
+        `super().__init__(vllm_config, device, is_pin_memory)`.
+
+        Subclass constructor may find it useful to utilize the `vllm_config`,
+        `device` and `is_pin_memory` argument. However regardless of whether
+        these arguments are used, the vLLM logits processor interface requires
+        all three arguments to be present.
+        """
+
+        # Map req index -> logits processor state
+        #
+        # State representation is a partial[Tensor] comprising a request-level
+        # logits processor with the output token ids argument and (if required)
+        # the prompt token ids argument pre-populated
+        #
+        # Note that the partial carries a *reference* to output token ids, and
+        # will thus always operate on the list as it is currently, not as it
+        # was when the partial was created.
+        self.req_info: dict[int, partial[torch.Tensor]] = {}
+
+    @abstractmethod
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """Consume request info; return a per-request logits processor.
+
+        Return None if logits processor does not need to be applied to request
+
+        Args:
+          params: request sampling params
+
+        Returns:
+          None if logits processor should not be applied to request; otherwise
+          returns a `RequestLogitsProcessor` instance
+        
+        """
+        raise NotImplementedError
+
+    def _new_state(
+        self,
+        params: SamplingParams,
+        prompt_ids: list[int],
+        output_ids: list[int],
+    ) -> Optional[partial[torch.Tensor]]:
+        """Return state representation for new request
+
+        Returns None if logits processor is not applicable to request
+
+        Args:
+          params: request sampling params
+          prompt_ids: request prompt token ids
+          output_ids: decoded tokens so far for this request
+
+        Returns:
+          logits processor partial[Tensor] or None
+        
+        """
+        if req_lp := self.new_req_logits_processor(params):
+            args = [prompt_ids, output_ids] if (len(
+                inspect.signature(req_lp).parameters) == 3) else [output_ids]
+            return partial(req_lp, *args)
+        return None
+
+    def update_state(self, batch_update: Optional[BatchUpdate]):
+        process_dict_updates(
+            self.req_info,
+            batch_update,
+            self._new_state,
+        )
+
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        if self.req_info:
+            # Apply per-request logits processors to corresponding rows of
+            # logits tensor
+            for req_idx, req_lp in self.req_info.items():
+                req_logits = logits[req_idx]
+                new_logits = req_lp(req_logits)
+                if new_logits is not req_logits:
+                    # Modify logits tensor row in-place if necessary
+                    logits[req_idx] = new_logits
+        return logits
+
+
 __all__ = [
     "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor",
     "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder",
     "MoveDirectionality", "LogitsProcessors", "build_logitsprocs",
-    "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP"
+    "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP",
+    "AdapterLogitsProcessor"
 ]
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 7bd4a5a380ac06193af2b1b88a1c2874751e95ab..cc5653b10ec1d8f643e18403592f192bac5827b9 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -73,10 +73,8 @@ class TopKTopPSampler(nn.Module):
                 self.forward = self.forward_native
         else:
             self.forward = self.forward_native
-        if current_platform.is_tpu():
-            self.apply_top_k_top_p = apply_top_k_top_p_tpu
-        else:
-            self.apply_top_k_top_p = apply_top_k_top_p
+
+        self.apply_top_k_top_p = apply_top_k_top_p
 
     def forward_native(
         self,
@@ -125,53 +123,6 @@ class TopKTopPSampler(nn.Module):
         return flashinfer_sample(logits.contiguous(), k, p, generators), None
 
 
-def apply_top_k_top_p_tpu(
-    logits: torch.Tensor,
-    k: torch.Tensor,
-    p: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Apply top-k and top-p optimized for TPU.
-
-    This algorithm avoids using torch.scatter which is extremely slow on TPU.
-    This is achieved by finding a "cut-off" element in the original logit, and
-    after thresholding the logit using this cut-off, the remaining elements
-    shall constitute the top-p set.
-
-    Note: in the case of tie (i.e. multipple cut-off elements present in the
-    logit), all tie elements are included in the top-p set. In other words,
-    this function does not break ties. Instead, these tie tokens have equal
-    chance of being chosen during final sampling, so we can consider the tie
-    being broken then.
-    """
-    probs = logits.softmax(dim=-1)
-    probs_sort, _ = probs.sort(dim=-1, descending=False)
-
-    if k is not None:
-        top_k_count = probs_sort.size(1) - k.to(torch.long)  # shape: (batch, )
-        top_k_count = top_k_count.unsqueeze(dim=1)
-        top_k_cutoff = probs_sort.gather(-1, top_k_count)
-
-        # Make sure the no top-k rows are no-op.
-        no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)
-        top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))
-
-        elements_to_discard = probs < top_k_cutoff
-        logits.masked_fill_(elements_to_discard, -float("inf"))
-
-    if p is not None:
-        cumprob = torch.cumsum(probs_sort, dim=-1)
-        top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)
-        top_p_mask[:, -1] = False  # at least one
-
-        top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)
-        top_p_cutoff = probs_sort.gather(-1, top_p_count)
-        elements_to_discard = probs < top_p_cutoff
-        logits.masked_fill_(elements_to_discard, -float("inf"))
-
-    return logits
-
-
 def apply_top_k_top_p(
     logits: torch.Tensor,
     k: Optional[torch.Tensor],
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index e84136e3a6d07b994734b72a6288b0c49ff98a5c..17b83a4ba074c706b7f17fb326575ed2facee28e 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -2,11 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampler layer implementing TPU supported operations."""
 
+from typing import Optional
+
 import torch
 import torch.nn as nn
 
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
-from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
 from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata
 
 _SAMPLING_EPS = 1e-5
@@ -17,7 +18,6 @@ class Sampler(nn.Module):
     def __init__(self):
         # TODO(houseroad): Add support for logprobs_mode.
         super().__init__()
-        self.topk_topp_sampler = TopKTopPSampler()
 
     def forward(
         self,
@@ -65,13 +65,17 @@ class Sampler(nn.Module):
             logits = self.apply_min_p(logits, sampling_metadata.min_p)
 
         # Apply top_k and/or top_p.
-        random_sampled, _ = self.topk_topp_sampler(
+        logits = apply_top_k_top_p(
             logits,
-            sampling_metadata.generators,
             sampling_metadata.top_k,
             sampling_metadata.top_p,
         )
 
+        # Random sample.
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
+        random_sampled = self.random_sample(probs,
+                                            sampling_metadata.generators)
+
         sampled = torch.where(sampling_metadata.temperature < _SAMPLING_EPS,
                               greedy_sampled, random_sampled)
         return sampled
@@ -144,3 +148,66 @@ class Sampler(nn.Module):
         # Apply mask using boolean indexing (xla friendly)
         logits.masked_fill_(~valid_token_mask, -float("inf"))
         return logits
+
+    def random_sample(
+        self,
+        probs: torch.Tensor,
+        generators: dict[int, torch.Generator],
+    ) -> torch.Tensor:
+        q = torch.empty_like(probs)
+        # NOTE(woosuk): To batch-process the requests without their own seeds,
+        # which is the common case, we first assume that every request does
+        # not have its own seed. Then, we overwrite the values for the requests
+        # that have their own seeds.
+        q.exponential_()
+        if generators:
+            for i, generator in generators.items():
+                q[i].exponential_(generator=generator)
+        return probs.div_(q).argmax(dim=-1).view(-1)
+
+
+def apply_top_k_top_p(
+    logits: torch.Tensor,
+    k: Optional[torch.Tensor],
+    p: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """
+    Apply top-k and top-p optimized for TPU.
+
+    This algorithm avoids using torch.scatter which is extremely slow on TPU.
+    This is achieved by finding a "cut-off" element in the original logit, and
+    after thresholding the logit using this cut-off, the remaining elements
+    shall constitute the top-p set.
+
+    Note: in the case of tie (i.e. multipple cut-off elements present in the
+    logit), all tie elements are included in the top-p set. In other words,
+    this function does not break ties. Instead, these tie tokens have equal
+    chance of being chosen during final sampling, so we can consider the tie
+    being broken then.
+    """
+    probs = logits.softmax(dim=-1)
+    probs_sort, _ = probs.sort(dim=-1, descending=False)
+
+    if k is not None:
+        top_k_count = probs_sort.size(1) - k.to(torch.long)  # shape: (batch, )
+        top_k_count = top_k_count.unsqueeze(dim=1)
+        top_k_cutoff = probs_sort.gather(-1, top_k_count)
+
+        # Make sure the no top-k rows are no-op.
+        no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1)
+        top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf"))
+
+        elements_to_discard = probs < top_k_cutoff
+        logits.masked_fill_(elements_to_discard, -float("inf"))
+
+    if p is not None:
+        cumprob = torch.cumsum(probs_sort, dim=-1)
+        top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1)
+        top_p_mask[:, -1] = False  # at least one
+
+        top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1)
+        top_p_cutoff = probs_sort.gather(-1, top_p_count)
+        elements_to_discard = probs < top_p_cutoff
+        logits.masked_fill_(elements_to_discard, -float("inf"))
+
+    return logits
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 7e09074d65435bdb2df9872e5200d1f60e80e16c..0727244b44ed86ab7fc858ae9e0b4f84c52010b1 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -218,8 +218,9 @@ class EagleProposer:
                 hidden_states=self.hidden_states[:num_input_tokens],
                 inputs_embeds=inputs_embeds,
             )
-            if self.method in ("deepseek_mtp", "ernie_mtp"):
+            if self.method in ("deepseek_mtp", "ernie_mtp", "qwen3_next_mtp"):
                 last_hidden_states = ret_hidden_states
+                hidden_states = last_hidden_states
             else:
                 last_hidden_states, hidden_states = ret_hidden_states
         sample_hidden_states = last_hidden_states[last_token_indices]
@@ -321,12 +322,18 @@ class EagleProposer:
             with set_forward_context(per_layer_attn_metadata,
                                      self.vllm_config,
                                      num_tokens=input_batch_size):
-                last_hidden_states, hidden_states = self.model(
+                ret_hidden_states = self.model(
                     input_ids=input_ids,
                     positions=self.positions[:input_batch_size],
                     hidden_states=self.hidden_states[:input_batch_size],
                     inputs_embeds=inputs_embeds,
                 )
+                if self.method in ("deepseek_mtp", "ernie_mtp",
+                                   "qwen3_next_mtp"):
+                    last_hidden_states = ret_hidden_states
+                    hidden_states = ret_hidden_states
+                else:
+                    last_hidden_states, hidden_states = ret_hidden_states
             hidden_states = hidden_states[:batch_size]
             logits = self.model.compute_logits(last_hidden_states[:batch_size],
                                                None)
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index b4bc3058c570ac3cbf5d27ba6ad78c8d3758c211..2aa8962f5739c0e124f6969202ad89bd8353ee1b 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import time
 from dataclasses import dataclass, field
 from typing import Optional
 
@@ -58,6 +59,7 @@ class SpecDecodingLogging:
         self.num_draft_tokens: list[int] = []
         self.num_accepted_tokens: list[int] = []
         self.accepted_tokens_per_pos_lists: list[list[int]] = []
+        self.last_log_time = time.monotonic()
 
     def observe(self, spec_decoding_stats: SpecDecodingStats):
         self.num_drafts.append(spec_decoding_stats.num_drafts)
@@ -73,6 +75,13 @@ class SpecDecodingLogging:
         num_drafts = np.sum(self.num_drafts)
         num_draft_tokens = np.sum(self.num_draft_tokens)
         num_accepted_tokens = np.sum(self.num_accepted_tokens)
+        draft_throughput = 0
+        accepted_throughput = 0
+
+        elapsed_time = time.monotonic() - self.last_log_time
+        if elapsed_time > 0:
+            draft_throughput = num_draft_tokens / elapsed_time
+            accepted_throughput = num_accepted_tokens / elapsed_time
 
         draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
                                  100 if num_draft_tokens > 0 else float("nan"))
@@ -86,16 +95,20 @@ class SpecDecodingLogging:
 
         log_fn(
             "SpecDecoding metrics: "
-            "Draft acceptance rate: %.1f%%, "
             "Mean acceptance length: %.2f, "
+            "Accepted throughput: %.2f tokens/s, "
+            "Drafted throughput: %.2f tokens/s, "
             "Accepted: %d tokens, "
             "Drafted: %d tokens, "
-            "Per-position acceptance rate: %s",
-            draft_acceptance_rate,
+            "Per-position acceptance rate: %s, "
+            "Avg Draft acceptance rate: %.1f%%",
             mean_acceptance_length,
+            accepted_throughput,
+            draft_throughput,
             num_accepted_tokens,
             num_draft_tokens,
             rates_str,
+            draft_acceptance_rate,
         )
         self.reset()
 
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py
index fbcf2cb50d371b5ed47ed9bec1c71e4f56d7d68f..b92e396d4536e1ddefc1a19a5aba9b08eecd556c 100644
--- a/vllm/v1/spec_decode/ngram_proposer.py
+++ b/vllm/v1/spec_decode/ngram_proposer.py
@@ -107,7 +107,7 @@ def _find_longest_matched_ngram_and_propose_tokens(
     longest_ngram = 0
     position = 0
 
-    # lps[0] always equal to 0, we starts with index 1
+    # lps[0] always equal to 0, we start with index 1
     prev_lps = 0
     i = 1
     while i < total_token:
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 95319831d51214b4e1117efd9cf55b6270cc15ef..953185a8fc31de645701fa3ad16b1b75957cdb01 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
     elif xdg_cache_home:
         return os.path.join(xdg_cache_home, ".cache", "outlines")
     # If homedir is "/", we may be inside a container, and thus writing to
-    # root would be problematic, so we fallback to using a tempfile.
+    # root would be problematic, so we fall back to using a tempfile.
     # Also validate the path exists, since os.path.expanduser does
-    # not garuntee existence.
+    # not guarantee existence.
     elif os.path.isdir(home_dir) and home_dir != "/":
         # Default Unix fallback: ~/.cache/outlines
         return os.path.join(home_dir, ".cache", "outlines")
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 8f9face6fbf2ee7a3b3d416ce96b7345c12e86d8..fd84b4a111f585627b48b1aa3ed9e5010e025ece 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,17 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
+import contextlib
 import multiprocessing
 import time
 import weakref
 from collections.abc import Sequence
+from contextlib import AbstractContextManager
 from multiprocessing import connection
 from multiprocessing.process import BaseProcess
 from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar,
                     Union, overload)
 
 import torch
+from torch.autograd.profiler import record_function
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
@@ -19,6 +23,8 @@ from vllm.utils import (get_open_port, get_open_zmq_ipc_path, get_tcp_uri,
                         kill_process_tree)
 
 if TYPE_CHECKING:
+    import numpy as np
+
     from vllm.v1.engine.coordinator import DPCoordinator
     from vllm.v1.engine.utils import (CoreEngineActorManager,
                                       CoreEngineProcManager)
@@ -97,20 +103,31 @@ class ConstantList(Generic[T], Sequence):
 
 
 class CpuGpuBuffer:
+    """Buffer to easily copy tensors between CPU and GPU."""
 
     def __init__(
         self,
-        *args,
+        *size: Union[int, torch.SymInt],
         dtype: torch.dtype,
         device: torch.device,
         pin_memory: bool,
-    ):
-        self.cpu = torch.zeros(*args,
+        with_numpy: bool = True,
+    ) -> None:
+        self.cpu = torch.zeros(*size,
                                dtype=dtype,
                                device="cpu",
                                pin_memory=pin_memory)
-        self.np = self.cpu.numpy()
         self.gpu = self.cpu.to(device)
+        self.np: np.ndarray
+        # To keep type hints simple (avoiding generics and subclasses), we
+        # only conditionally create the numpy array attribute. This can cause
+        # AttributeError if `self.np` is accessed when `with_numpy=False`.
+        if with_numpy:
+            if dtype == torch.bfloat16:
+                raise ValueError(
+                    "Bfloat16 torch tensors cannot be directly cast to a "
+                    "numpy array, so call CpuGpuBuffer with with_numpy=False")
+            self.np = self.cpu.numpy()
 
     def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor:
         if n is None:
@@ -142,7 +159,7 @@ def get_engine_client_zmq_addr(local_only: bool,
 
 class APIServerProcessManager:
     """Manages a group of API server processes.
-    
+
     Handles creation, monitoring, and termination of API server worker
     processes. Also monitors extra processes to check if they are healthy.
     """
@@ -159,7 +176,7 @@ class APIServerProcessManager:
         stats_update_address: Optional[str] = None,
     ):
         """Initialize and start API server worker processes.
-        
+
         Args:
             target_server_fn: Function to call for each API server process
             listen_address: Address to listen for client connections
@@ -168,7 +185,7 @@ class APIServerProcessManager:
             num_servers: Number of API server processes to start
             input_addresses: Input addresses for each API server
             output_addresses: Output addresses for each API server
-            stats_update_address: Optional stats update address 
+            stats_update_address: Optional stats update address
         """
         self.listen_address = listen_address
         self.sock = sock
@@ -212,7 +229,7 @@ def wait_for_completion_or_failure(
                                        "CoreEngineActorManager"]] = None,
         coordinator: Optional["DPCoordinator"] = None) -> None:
     """Wait for all processes to complete or detect if any fail.
-    
+
     Raises an exception if any process exits with a non-zero status.
 
     Args:
@@ -338,7 +355,8 @@ def report_usage_stats(
             vllm_config.cache_config.block_size,
             "gpu_memory_utilization":
             vllm_config.cache_config.gpu_memory_utilization,
-
+            "kv_cache_memory_bytes":
+            vllm_config.cache_config.kv_cache_memory_bytes,
             # Quantization
             "quantization":
             vllm_config.model_config.quantization,
@@ -355,3 +373,10 @@ def report_usage_stats(
             "disable_custom_all_reduce":
             vllm_config.parallel_config.disable_custom_all_reduce,
         })
+
+
+def record_function_or_nullcontext(name: str) -> AbstractContextManager:
+    if envs.VLLM_CUSTOM_SCOPES_FOR_PROFILING:
+        return record_function(name)
+    else:
+        return contextlib.nullcontext()
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 6ab5ce2748a4a05b6b74654a5a2147e6eaf16a85..194984bf5053628bc60755986c475450131ae21a 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -4,6 +4,7 @@
 import numpy as np
 import torch
 
+from vllm.distributed import get_dcp_group
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 
@@ -50,6 +51,13 @@ class BlockTable:
         self.slot_mapping = torch.zeros(self.max_num_batched_tokens,
                                         dtype=torch.int64,
                                         device=self.device)
+        try:
+            self.dcp_world_size = get_dcp_group().world_size
+            self.dcp_rank = get_dcp_group().rank_in_group
+        except AssertionError:
+            # DCP might not be initialized in testing
+            self.dcp_world_size = 1
+            self.dcp_rank = 0
 
     def append_row(
         self,
@@ -89,13 +97,36 @@ class BlockTable:
         # NOTE(woosuk): We can't simply use `token_indices // block_size`
         # here because M (max_model_len) is not necessarily divisible by
         # block_size.
-        block_table_indices = (req_indices * self.max_num_blocks_per_req +
-                               positions // self.block_size)
-        block_numbers = self.block_table_np.ravel()[block_table_indices]
-        block_offsets = positions % self.block_size
-        np.add(block_numbers * self.block_size,
-               block_offsets,
-               out=self.slot_mapping_np[:req_indices.shape[0]])
+        if self.dcp_world_size > 1:
+            # Note(hc): The DCP implement store kvcache with an interleave
+            # style, the kvcache for the token whose token_idx is i is
+            # always stored on the GPU whose dcp_rank equals i % cp_world_size:
+
+            # Use a "virtual block" which equals to world_size * block_size
+            # for block_table_indices calculation.
+            virtual_block_size = self.block_size * self.dcp_world_size
+            block_table_indices = (req_indices * self.max_num_blocks_per_req +
+                                   positions // virtual_block_size)
+            block_numbers = self.block_table_np.ravel()[block_table_indices]
+            # Use virtual_block_size for mask calculation, which marks local
+            # tokens.
+            virtual_block_offsets = positions % virtual_block_size
+            mask = virtual_block_offsets % self.dcp_world_size == self.dcp_rank
+            # Calculate local block_offsets
+            block_offsets = virtual_block_offsets // self.dcp_world_size
+            # Calculate slot_mapping
+            slot_mapping = block_numbers * self.block_size + block_offsets
+            # Write final slots, use -1 for not-local
+            self.slot_mapping_np[:req_indices.shape[0]] = np.where(
+                mask, slot_mapping, -1)
+        else:
+            block_table_indices = (req_indices * self.max_num_blocks_per_req +
+                                   positions // self.block_size)
+            block_numbers = self.block_table_np.ravel()[block_table_indices]
+            block_offsets = positions % self.block_size
+            np.add(block_numbers * self.block_size,
+                   block_offsets,
+                   out=self.slot_mapping_np[:req_indices.shape[0]])
 
     def commit_block_table(self, num_reqs: int) -> None:
         self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
@@ -125,14 +156,30 @@ class BlockTable:
 class MultiGroupBlockTable:
     """The BlockTables for each KV cache group."""
 
-    def __init__(self, max_num_reqs: int, max_model_len: int,
-                 max_num_batched_tokens: int, pin_memory: bool,
-                 device: torch.device, block_sizes: list[int]) -> None:
+    def __init__(self,
+                 max_num_reqs: int,
+                 max_model_len: int,
+                 max_num_batched_tokens: int,
+                 pin_memory: bool,
+                 device: torch.device,
+                 block_sizes: list[int],
+                 num_speculative_tokens: int = 0) -> None:
+        # Note(hc): each dcp rank only store
+        # (max_model_len//dcp_world_size) tokens in kvcache,
+        # so the block_size which used for calc max_num_blocks_per_req
+        # must be multiplied by dcp_world_size.
+        try:
+            dcp_world_size = get_dcp_group().world_size
+        except AssertionError:
+            # DCP might not be initialized in testing
+            dcp_world_size = 1
+
         self.block_tables = [
-            BlockTable(block_size, max_num_reqs, cdiv(max_model_len,
-                                                      block_size),
-                       max_num_batched_tokens, pin_memory, device)
-            for block_size in block_sizes
+            BlockTable(
+                block_size, max_num_reqs,
+                max(cdiv(max_model_len, block_size * dcp_world_size),
+                    1 + num_speculative_tokens), max_num_batched_tokens,
+                pin_memory, device) for block_size in block_sizes
         ]
 
     def append_row(self, block_ids: tuple[list[int], ...],
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index feb49978d7518a1d82eb6f087298b5c2d3c55e4f..d5ec19b86b0612a6da4676e78b11a443245c78db 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner):
             raise ValueError("Multiple KVCacheGroups is not"
                              "currently supported with CPU model runner.")
 
-        assert type(self.attn_groups[0]
-                    [0].metadata_builder) is TorchSDPAMetadataBuilderV1
+        # Guard against encoder-only / pooling models where `attn_groups`
+        # may be empty or lack the expected metadata_builder.
+        # Without this check, accessing `attn_groups[0][0]` would trigger
+        # an AssertionError on CPU backend.
+        if not hasattr(self, "attn_groups") or not self.attn_groups:
+            return
+        if not self.attn_groups[0]:
+            return
+
+        mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
+        if not isinstance(mb, TorchSDPAMetadataBuilderV1):
+            # Encoder-only / rerank models do not benefit from reordering,
+            # so we safely skip here.
+            return
 
-        self.attn_groups[0][0].metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
+        # Safe path for decoder/attention-heavy models
+        mb.reorder_batch(self.input_batch, scheduler_output)
 
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index f4c2f45df5954100dcff394f77b5b73afb5a4b00..1cf56656d7adfb6dc91a9caf773de2acaff853eb 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -83,6 +83,7 @@ class InputBatch:
         logitsprocs: Optional[LogitsProcessors] = None,
         is_spec_decode: bool = False,
         is_pooling_model: bool = False,
+        num_speculative_tokens: int = 0,
     ):
         self.is_pooling_model = is_pooling_model
         self.is_spec_decode = is_spec_decode
@@ -127,6 +128,7 @@ class InputBatch:
             pin_memory=pin_memory,
             device=device,
             block_sizes=block_sizes,
+            num_speculative_tokens=num_speculative_tokens,
         )
 
         # Sampling-related.
@@ -202,6 +204,14 @@ class InputBatch:
             self.repetition_penalties_cpu_tensor.numpy()
         self.repetition_penalties_reqs: set[str] = set()
 
+        # Speculative decoding
+        self.num_accepted_tokens_cpu_tensor = torch.ones((max_num_reqs, ),
+                                                         dtype=torch.int64,
+                                                         device="cpu",
+                                                         pin_memory=pin_memory)
+        self.num_accepted_tokens_cpu = \
+            self.num_accepted_tokens_cpu_tensor.numpy()
+
         # lora related
         self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
                                              dtype=np.int32)
@@ -250,6 +260,11 @@ class InputBatch:
 
         self.pooling_params: dict[str, PoolingParams] = {}
 
+        # Cached reference to the GPU tensor of previously sampled tokens
+        self.prev_sampled_token_ids: Optional[torch.Tensor] = None
+        self.prev_sampled_token_ids_invalid_indices: Optional[set[int]] = None
+        self.prev_req_id_to_index: Optional[dict[str, int]] = None
+
     @property
     def req_ids(self) -> list[str]:
         # None elements should only be present transiently
@@ -355,8 +370,9 @@ class InputBatch:
                                              if sampling_params.logprobs == -1
                                              else sampling_params.logprobs)
             if sampling_params.prompt_logprobs is not None:
-                self.num_prompt_logprobs[
-                    req_id] = sampling_params.prompt_logprobs
+                self.num_prompt_logprobs[req_id] = (
+                    self.vocab_size if sampling_params.prompt_logprobs == -1
+                    else sampling_params.prompt_logprobs)
 
             if sampling_params.allowed_token_ids:
                 self.has_allowed_token_ids.add(req_id)
@@ -388,6 +404,9 @@ class InputBatch:
         else:
             raise NotImplementedError("Unrecognized request type")
 
+        # Speculative decoding: by default 1 token is generated.
+        self.num_accepted_tokens_cpu[req_index] = 1
+
         # Add request lora ID
         if request.lora_request:
             lora_id = request.lora_request.lora_int_id
@@ -509,6 +528,8 @@ class InputBatch:
             self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1]
         self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] = \
             self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
+        self.num_accepted_tokens_cpu[i1], self.num_accepted_tokens_cpu[i2] =\
+            self.num_accepted_tokens_cpu[i2], self.num_accepted_tokens_cpu[i1]
 
         swap_dict_values(self.generators, i1, i2)
         swap_dict_values(self.bad_words_token_ids, i1, i2)
@@ -584,7 +605,7 @@ class InputBatch:
 
             if self.is_pooling_model:
                 last_req_index -= 1
-                # Samping state not used by pooling models.
+                # Sampling state not used by pooling models.
                 continue
 
             # Autoregressive models require detailed tracking of condense
@@ -603,6 +624,8 @@ class InputBatch:
                 empty_index] = self.presence_penalties_cpu[last_req_index]
             self.repetition_penalties_cpu[
                 empty_index] = self.repetition_penalties_cpu[last_req_index]
+            self.num_accepted_tokens_cpu[
+                empty_index] = self.num_accepted_tokens_cpu[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
@@ -704,17 +727,12 @@ class InputBatch:
             logitsprocs=self.logitsprocs,
         )
 
-    @property
-    def pooling_metadata(self) -> PoolingMetadata:
-        if len(self.pooling_params) == 0:
-            pooling_params = []
-        else:
-            # Note, for now this assumes that all request in the batch
-            # are either sampling or pooling requests
-            assert len(self.req_ids) == len(self.pooling_params)
-            pooling_params = [
-                self.pooling_params[req_id] for req_id in self.req_ids
-            ]
+    def get_pooling_params(self) -> list[PoolingParams]:
+        assert len(self.req_ids) == len(self.pooling_params)
+        return [self.pooling_params[req_id] for req_id in self.req_ids]
+
+    def get_pooling_metadata(self) -> PoolingMetadata:
+        pooling_params = self.get_pooling_params()
 
         return PoolingMetadata(
             prompt_lens=torch.from_numpy(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 08e13ab887bf909dc2dde0408b56c2abfc0c7cba..ebb18e81c38a8f8d89360328e69aa359afe84177 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -28,6 +28,7 @@ from vllm.config import (CompilationLevel, CUDAGraphMode, VllmConfig,
 from vllm.distributed.eplb.eplb_state import EplbState
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
+from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.distributed.parallel_state import (
     get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
     prepare_communication_buffer_for_model)
@@ -52,22 +53,27 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, cdiv, check_use_alibi,
-                        get_dtype_size, is_pin_memory_available, round_up,
-                        supports_dynamo)
+                        GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size,
+                        is_pin_memory_available, round_up, supports_dynamo)
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     create_fast_prefill_custom_backend,
     reorder_batch_to_split_decodes_and_prefills)
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+# yapf conflicts with isort for this block
+# yapf: disable
 from vllm.v1.kv_cache_interface import (AttentionSpec,
                                         ChunkedLocalAttentionSpec,
+                                        CrossAttentionSpec,
                                         EncoderOnlyAttentionSpec,
                                         FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         MambaSpec, SlidingWindowSpec)
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
-                             LogprobsTensors, ModelRunnerOutput)
+# yapf: enable
+from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
+                             DraftTokenIds, LogprobsLists, LogprobsTensors,
+                             ModelRunnerOutput, SamplerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -77,7 +83,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.utils import CpuGpuBuffer
+from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.kv_connector_model_runner_mixin import (
     KVConnectorModelRunnerMixin, KVConnectorOutput)
@@ -90,19 +96,62 @@ from .utils import (AttentionGroup, MultiModalBudget,
 
 if TYPE_CHECKING:
     import xgrammar as xgr
-    import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile  # noqa: E501
 
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.core.sched.output import SchedulerOutput
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
-    xgr_torch_compile = LazyLoader(
-        "xgr_torch_compile", globals(),
-        "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")
 
 logger = init_logger(__name__)
 
 
+# Wrapper for ModelRunnerOutput to support overlapped execution.
+class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
+
+    def __init__(
+        self,
+        model_runner_output: ModelRunnerOutput,
+        sampled_token_ids: torch.Tensor,
+        invalid_req_indices: list[int],
+        async_output_copy_stream: torch.cuda.Stream,
+    ):
+        self._model_runner_output = model_runner_output
+        self._invalid_req_indices = invalid_req_indices
+
+        # Event on the copy stream so we can synchronize the non-blocking copy.
+        self._async_copy_ready_event = torch.cuda.Event()
+
+        # Keep a reference to the device tensor to avoid it being
+        # deallocated until we finish copying it to the host.
+        self._sampled_token_ids = sampled_token_ids
+
+        # Initiate the copy on a separate stream, but do not synchronize it.
+        default_stream = torch.cuda.current_stream()
+        with torch.cuda.stream(async_output_copy_stream):
+            async_output_copy_stream.wait_stream(default_stream)
+            self._sampled_token_ids_cpu = self._sampled_token_ids.to(
+                'cpu', non_blocking=True)
+            self._async_copy_ready_event.record()
+
+    def get_output(self) -> ModelRunnerOutput:
+        """Copy the device tensors to the host and return a ModelRunnerOutput.
+        
+        This function blocks until the copy is finished.
+        """
+        self._async_copy_ready_event.synchronize()
+
+        # Release the device tensor once the copy has completed
+        del self._sampled_token_ids
+
+        valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
+        for i in self._invalid_req_indices:
+            valid_sampled_token_ids[i].clear()
+
+        output = self._model_runner_output
+        output.sampled_token_ids = valid_sampled_token_ids
+        return output
+
+
 class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def __init__(
@@ -138,11 +187,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
                 cache_config.cache_dtype]
 
-        self.is_pooling_model = model_config.pooler_config is not None
+        self.is_pooling_model = (model_config.runner_type == 'pooling')
         self.is_multimodal_raw_input_only_model = (
             model_config.is_multimodal_raw_input_only_model)
 
         self.max_model_len = model_config.max_model_len
+        self.dcp_world_size = self.parallel_config.decode_context_parallel_size
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
 
@@ -162,6 +212,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs(
             model_config)
 
+        if self.model_config.is_encoder_decoder:
+            # Maximum length of the encoder input, only for encoder-decoder
+            # models.
+            self.max_encoder_len = self.mm_registry.\
+                get_encdec_max_encoder_len(model_config)
+        else:
+            self.max_encoder_len = 0
+
         # Sampler
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
@@ -219,7 +277,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the block_sizes in the kv cache config.
         self.input_batch = InputBatch(
             max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
+            # We need to use the encoder length for encoder-decoer
+            # because of KV cache for cross-attention.
+            max_model_len=max(self.max_model_len, self.max_encoder_len),
             max_num_batched_tokens=self.max_num_tokens,
             device=self.device,
             pin_memory=self.pin_memory,
@@ -233,6 +293,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             is_pooling_model=self.is_pooling_model,
         )
 
+        self.use_async_scheduling = self.scheduler_config.async_scheduling
+        self.async_output_copy_stream = torch.cuda.Stream() if \
+            self.use_async_scheduling else None
+
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
@@ -253,10 +317,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.query_start_loc = self._make_buffer(self.max_num_reqs + 1,
                                                  dtype=torch.int32)
         self.seq_lens = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
-        self.inputs_embeds = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+        # Because inputs_embeds may be bfloat16 and we don't need a numpy
+        # version of this tensor, avoid a RuntimeError by not creating a
+        # numpy buffer.
+        self.inputs_embeds = self._make_buffer(self.max_num_tokens,
+                                               self.hidden_size,
+                                               dtype=self.dtype,
+                                               numpy=False)
+        self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
+                                                  dtype=torch.int32)
+        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
+                                                     dtype=torch.int64)
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
@@ -273,6 +344,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.mrope_positions = self._make_buffer(
                 (3, self.max_num_tokens + 1), dtype=torch.int64)
 
+        # CUDA event to synchronize use of reused CPU tensors between steps
+        # when async scheduling is enabled.
+        self.prepare_inputs_event: Optional[torch.cuda.Event] = None
+        if self.use_async_scheduling:
+            self.prepare_inputs_event = torch.cuda.Event()
+            # Start in a completed state.
+            self.prepare_inputs_event.record(torch.cuda.default_stream())
+
         # None in the first PP rank. The rest are set after load_model.
         self.intermediate_tensors: Optional[IntermediateTensors] = None
 
@@ -301,11 +380,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Cudagraph dispatcher for runtime cudagraph dispatching.
         self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config)
 
-        self.mm_budget = (MultiModalBudget(
+        self.mm_budget = MultiModalBudget(
             self.model_config,
             self.scheduler_config,
             self.mm_registry,
-        ) if self.supports_mm_inputs else None)
+        ) if self.supports_mm_inputs else None
 
         self.reorder_batch_threshold: Optional[int] = None
 
@@ -324,25 +403,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             device="cpu",
             pin_memory=self.pin_memory)
 
-    def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
-        return CpuGpuBuffer(*args,
+    def _make_buffer(self,
+                     *size: Union[int, torch.SymInt],
+                     dtype: torch.dtype,
+                     numpy: bool = True) -> CpuGpuBuffer:
+        # Bfloat16 torch tensors cannot be directly cast to a numpy array, so
+        # if a bfloat16 buffer is needed without a corresponding numpy array,
+        # don't bother instantiating the numpy array.
+        return CpuGpuBuffer(*size,
                             dtype=dtype,
                             device=self.device,
-                            pin_memory=self.pin_memory)
+                            pin_memory=self.pin_memory,
+                            with_numpy=numpy)
 
     def _init_model_kwargs(self, num_tokens: int):
         model_kwargs = dict[str, Any]()
-        num_reqs = self.input_batch.num_reqs
 
-        num_pooling_reqs = len(self.input_batch.pooling_params)
-
-        if num_pooling_reqs == 0:
+        if not self.is_pooling_model:
             return model_kwargs
 
-        # This does nontrivial work.
-        pooling_params = self.input_batch.pooling_metadata.pooling_params
-
-        assert num_pooling_reqs == num_reqs
+        num_reqs = self.input_batch.num_reqs
+        pooling_params = self.input_batch.get_pooling_params()
 
         token_type_id_requests = dict[int, Any]()
         for i, param in enumerate(pooling_params):
@@ -385,6 +466,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             return
 
         if self.reorder_batch_threshold is not None:
+            # NOTE(lucas): currently no backend supports the custom masking
+            #  required for DCP with q_len > 1, so we assert here. Remove this
+            #  assert once the custom mask is support is added to FA3.
+            if self.dcp_world_size > 1:
+                assert self.reorder_batch_threshold == 1, \
+                    "DCP not support reorder_batch_threshold > 1 now."
             reorder_batch_to_split_decodes_and_prefills(
                 self.input_batch,
                 scheduler_output,
@@ -456,7 +543,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             else:
                 generator = None
 
-            if pooling_params:
+            if self.is_pooling_model:
+                assert pooling_params is not None
                 task = pooling_params.task
                 assert task is not None, "You did not set `task` in the API"
 
@@ -579,6 +667,31 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Refresh batch metadata with any pending updates.
         self.input_batch.refresh_metadata()
 
+    def _update_states_after_model_execute(
+            self, output_token_ids: torch.Tensor) -> None:
+        """Update the cached states after model execution.
+
+        This is used for MTP/EAGLE for hybrid models, as in linear attention,
+        only the last token's state is kept. In MTP/EAGLE, for draft tokens
+        the state are kept util we decide how many tokens are accepted for
+        each sequence, and a shifting is done during the next iteration
+        based on the number of accepted tokens.
+        """
+        if not self.model_config.is_hybrid or not self.speculative_config:
+            return
+
+        # Find the number of accepted tokens for each sequence.
+        num_accepted_tokens = (torch.cat(
+            [
+                output_token_ids,
+                torch.full((output_token_ids.size(0), 1),
+                           -1,
+                           device=output_token_ids.device),
+            ],
+            dim=1) == -1).int().argmax(-1).cpu().numpy()
+        for i, num_tokens in enumerate(num_accepted_tokens):
+            self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
+
     def _init_mrope_positions(self, req_state: CachedRequestState):
         image_grid_thw = []
         video_grid_thw = []
@@ -661,6 +774,91 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return cu_num_tokens, arange
 
+    def _prepare_input_ids(self, total_num_scheduled_tokens: int,
+                           cu_num_tokens: np.ndarray) -> None:
+        """Prepare the input IDs for the current batch.
+        
+        Carefully handles the `prev_sampled_token_ids` which can be cached
+        from the previous engine iteration, in which case those tokens on the
+        GPU need to be copied into the corresponding slots into input_ids."""
+
+        if self.input_batch.prev_sampled_token_ids is None:
+            # Normal scheduling case
+            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
+            return
+
+        # Async scheduling case, where some decode requests from the previous
+        # iteration won't have entries in input_ids_cpu and need to be copied
+        # on the GPU from prev_sampled_token_ids.
+        prev_req_id_to_index = self.input_batch.prev_req_id_to_index
+        assert prev_req_id_to_index is not None
+        flattened_indices = []
+        prev_common_req_indices = []
+        indices_match = True
+        max_flattened_index = -1
+        for req_id, cur_index in self.input_batch.req_id_to_index.items():
+            if (prev_index := prev_req_id_to_index.get(req_id)) is not None:
+                prev_common_req_indices.append(prev_index)
+                # We need to compute the flattened input_ids index of the
+                # last token in each common request.
+                flattened_index = cu_num_tokens[cur_index].item() - 1
+                flattened_indices.append(flattened_index)
+                indices_match &= (prev_index == flattened_index)
+                max_flattened_index = max(max_flattened_index, flattened_index)
+        num_commmon_tokens = len(flattened_indices)
+        if num_commmon_tokens < total_num_scheduled_tokens:
+            # If not all requests are decodes from the last iteration,
+            # We need to copy the input_ids_cpu to the GPU first.
+            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
+        if num_commmon_tokens == 0:
+            # No requests in common with the previous iteration
+            # So input_ids_cpu will have all the input ids.
+            return
+        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+            # Common-case optimization: the batch is unchanged
+            # and no reordering happened.
+            # The indices are both the same permutation of 0..N-1 so
+            # we can copy directly using a single slice.
+            self.input_ids.gpu[:num_commmon_tokens].copy_(
+                self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
+                                                        0],
+                non_blocking=True)
+            return
+        # Upload the index tensors asynchronously
+        # so the scatter can be non-blocking.
+        input_ids_index_tensor = torch.tensor(flattened_indices,
+                                              dtype=torch.int64,
+                                              pin_memory=self.pin_memory).to(
+                                                  self.device,
+                                                  non_blocking=True)
+        prev_common_req_indices_tensor = torch.tensor(
+            prev_common_req_indices,
+            dtype=torch.int64,
+            pin_memory=self.pin_memory).to(self.device, non_blocking=True)
+        self.input_ids.gpu.scatter_(
+            dim=0,
+            index=input_ids_index_tensor,
+            src=self.input_batch.prev_sampled_token_ids[
+                prev_common_req_indices_tensor, 0])
+
+    def _get_encoder_seq_lens(
+        self,
+        scheduler_output: "SchedulerOutput",
+        kv_cache_spec: KVCacheSpec,
+        num_reqs: int,
+    ) -> Optional[np.ndarray]:
+        if not isinstance(kv_cache_spec, CrossAttentionSpec):
+            return None
+
+        # Build encoder_seq_lens array mapping request indices to
+        # encoder lengths for inputs scheduled in this batch
+        encoder_seq_lens = np.zeros(num_reqs, dtype=np.int32)
+        for req_id in scheduler_output.scheduled_encoder_inputs:
+            req_index = self.input_batch.req_id_to_index[req_id]
+            encoder_seq_lens[req_index] = self.max_encoder_len
+
+        return encoder_seq_lens
+
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
@@ -747,7 +945,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         max_seq_len = self.seq_lens.np[:num_reqs].max().item()
 
         # Copy the tensors to the GPU.
-        self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
+        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
+
         if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
@@ -766,6 +965,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # We will ignore the sampled tokens from the partial requests.
             # TODO: Support prompt logprobs.
             logits_indices = query_start_loc[1:] - 1
+            num_draft_tokens = None
             spec_decode_metadata = None
         else:
             # Get the number of draft tokens for each request.
@@ -780,6 +980,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             spec_decode_metadata = self._calc_spec_decode_metadata(
                 num_draft_tokens, cu_num_tokens)
             logits_indices = spec_decode_metadata.logits_indices
+            self.num_draft_tokens.np[:num_reqs] = num_draft_tokens
+            self.num_draft_tokens.np[num_reqs:].fill(0)
+            self.num_draft_tokens.copy_to_gpu()
 
         logits_indices_padded = None
         if self.cache_config.kv_sharing_fast_prefill:
@@ -794,11 +997,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_computed_tokens_cpu = (
             self.input_batch.num_computed_tokens_cpu_tensor[:num_reqs])
         spec_decode_common_attn_metadata = None
+        if use_spec_decode:
+            self.num_accepted_tokens.np[:num_reqs] = (
+                self.input_batch.num_accepted_tokens_cpu[:num_reqs])
+            self.num_accepted_tokens.np[num_reqs:].fill(1)
+            self.num_accepted_tokens.copy_to_gpu()
 
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
+            encoder_seq_lens = self._get_encoder_seq_lens(
+                scheduler_output, kv_cache_group_spec.kv_cache_spec, num_reqs)
 
             if isinstance(kv_cache_group_spec.kv_cache_spec,
                           EncoderOnlyAttentionSpec):
@@ -843,6 +1053,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 logits_indices_padded=logits_indices_padded,
                 num_logits_indices=logits_indices.size(0),
                 causal=True,
+                encoder_seq_lens=encoder_seq_lens,
             )
 
             if self.speculative_config and \
@@ -861,10 +1072,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         builder,
                     )
 
-                attn_metadata_i = (builder.build(
+                extra_attn_metadata_args = {}
+                if use_spec_decode and isinstance(builder,
+                                                  GDNAttentionMetadataBuilder):
+                    extra_attn_metadata_args = dict(
+                        num_accepted_tokens=self.num_accepted_tokens.
+                        gpu[:num_reqs],
+                        num_draft_tokens=self.num_draft_tokens.gpu[:num_reqs],
+                    )
+
+                attn_metadata_i = builder.build(
                     common_prefix_len=common_prefix_len,
                     common_attn_metadata=common_attn_metadata,
-                ))
+                    **extra_attn_metadata_args)
 
                 for layer_name in attn_group.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i
@@ -1115,10 +1335,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.kv_sharing_fast_prefill_logits_indices[:num_logits_padded])
         return logits_indices_padded
 
-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
+    def _batch_mm_kwargs_from_scheduler(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> tuple[list[MultiModalKwargsItem], list[tuple[str, PlaceholderRange]]]:
+        """Batch multimodal kwargs from scheduled encoder inputs.
+
+        Args:
+            scheduler_output: The scheduler output containing scheduled encoder
+              inputs.
+
+        Returns:
+            A tuple of (mm_kwargs, req_ids_pos) where:
+            - mm_kwargs: List of multimodal kwargs items to be batched
+            - mm_hashes_pos: List of (mm_hash, position_info) tuples
+        """
         scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
         if not scheduled_encoder_inputs:
-            return
+            return [], []
         # Batch the multi-modal inputs.
         mm_kwargs = list[MultiModalKwargsItem]()
         # list of tuple (mm_hash, position_info)
@@ -1132,6 +1366,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 mm_hashes_pos.append(
                     (mm_hash, req_state.mm_positions[mm_input_id]))
 
+        return mm_kwargs, mm_hashes_pos
+
+    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
+        # Batch the multi-modal inputs using the helper method.
+        mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler(
+            scheduler_output)
+
+        if not mm_kwargs:
+            return
+
         # Batch mm inputs as much as we can: if a request in the batch has
         # multiple modalities or a different modality than the previous one,
         # we process it separately to preserve item order.
@@ -1222,6 +1466,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 mm_embeds.append(mm_embeds_item)
         return mm_embeds
 
+    def _extract_encoder_inputs(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> dict[str, torch.Tensor]:
+        """Extract encoder inputs for encoder-decoder models.
+
+        This method extracts multimodal input features from scheduled encoder
+        inputs and formats them for the encoder-decoder model forward pass.
+        """
+        # Batch the multi-modal inputs using the helper method.
+        mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output)
+
+        if not mm_kwargs:
+            return {}
+
+        # Group MM kwargs by modality and extract features
+        encoder_features = {}
+        for _, _, mm_kwargs_group in group_mm_kwargs_by_modality(
+                mm_kwargs,
+                device=self.device,
+                pin_memory=self.pin_memory,
+        ):
+            # Add the grouped features to encoder_features dict
+            # This allows the model to receive them as kwargs (e.g.,
+            # input_features=...)
+            encoder_features.update(mm_kwargs_group)
+
+        return encoder_features
+
     def get_model(self) -> nn.Module:
         # get raw model out of the cudagraph wrapper.
         if isinstance(self.model, CUDAGraphWrapper):
@@ -1337,10 +1610,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # so we receive it in that format.
         grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
 
-        # Force use of the torch.compile implementation from xgrammar to work
-        # around issues with the Triton kernel in concurrent structured output
-        # scenarios. See PR #19565 and issues #19493, #18376 for details.
-        xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
+        xgr.apply_token_bitmask_inplace(
             logits,
             grammar_bitmask.to(self.device, non_blocking=True),
             indices=out_indices if not skip_out_indices else None,
@@ -1437,7 +1707,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         " a batch must be pooling request"
 
         hidden_states = hidden_states[:num_scheduled_tokens]
-        pooling_metadata = self.input_batch.pooling_metadata
+        pooling_metadata = self.input_batch.get_pooling_metadata()
         pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(),
                                               device=hidden_states.device)
         seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs]
@@ -1463,31 +1733,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             kv_connector_output=kv_connector_output,
         )
 
-    @torch.inference_mode()
-    def execute_model(
+    def _preprocess(
         self,
         scheduler_output: "SchedulerOutput",
         intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> Union[ModelRunnerOutput, IntermediateTensors]:
-        self._update_states(scheduler_output)
-        if not scheduler_output.total_num_scheduled_tokens:
-            if not has_kv_transfer_group():
-                # Return empty ModelRunnerOutput if there's no work to do.
-                return EMPTY_MODEL_RUNNER_OUTPUT
-
-            return self.kv_connector_no_forward(scheduler_output,
-                                                self.vllm_config)
-
-        if self.cache_config.kv_sharing_fast_prefill:
-            assert not self.input_batch.num_prompt_logprobs, (
-                "--kv-sharing-fast-prefill produces incorrect logprobs for "
-                "prompt tokens, tokens, please disable it when the requests "
-                "need prompt logprobs")
-
-        # Prepare the decoder inputs.
-        (attn_metadata, logits_indices, spec_decode_metadata,
-         num_scheduled_tokens_np, spec_decode_common_attn_metadata,
-         max_query_len) = self._prepare_inputs(scheduler_output)
+    ) -> tuple[int, int, Optional[torch.Tensor], Optional[torch.Tensor],
+               Optional[torch.Tensor], torch.Tensor,
+               Optional[IntermediateTensors], dict[str, Any]]:
 
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
@@ -1514,14 +1766,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
-        if self.supports_mm_inputs:
+        if (self.supports_mm_inputs and get_pp_group().is_first_rank
+                and not self.model_config.is_encoder_decoder):
             # Run the multimodal encoder if any.
             self._execute_mm_encoder(scheduler_output)
             mm_embeds = self._gather_mm_embeddings(scheduler_output)
-        else:
-            mm_embeds = []
 
-        if self.supports_mm_inputs and get_pp_group().is_first_rank:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
@@ -1531,11 +1781,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             )
 
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_scheduled_tokens].copy_(
+            self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(
                 inputs_embeds_scheduled)
 
             input_ids = None
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
             model_kwargs = {
                 **self._init_model_kwargs(num_scheduled_tokens),
                 **self._extract_mm_kwargs(scheduler_output),
@@ -1559,75 +1809,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_input_tokens, intermediate_tensors, True)
 
-        uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
-            num_scheduled_tokens == self.input_batch.num_reqs * max_query_len)
-        batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
-                                           uniform_decode=uniform_decode)
-        cudagraph_runtime_mode, batch_descriptor = \
-            self.cudagraph_dispatcher.dispatch(batch_descriptor)
-
-        # Run the model.
-        # Use persistent buffers for CUDA graphs.
-        with set_forward_context(
-                attn_metadata,
-                self.vllm_config,
-                num_tokens=num_input_tokens,
-                num_tokens_across_dp=num_tokens_across_dp,
-                cudagraph_runtime_mode=cudagraph_runtime_mode,
-                batch_descriptor=batch_descriptor,
-        ), self.maybe_get_kv_connector_output(
-                scheduler_output) as kv_connector_output:
-
-            model_output = self.model(
-                input_ids=input_ids,
-                positions=positions,
-                intermediate_tensors=intermediate_tensors,
-                inputs_embeds=inputs_embeds,
-                **model_kwargs,
-            )
-
-        if self.use_aux_hidden_state_outputs:
-            hidden_states, aux_hidden_states = model_output
-        else:
-            hidden_states = model_output
-            aux_hidden_states = None
-
-        # Broadcast PP output for external_launcher (torchrun)
-        # to make sure we are synced across pp ranks
-        # TODO: Support overlapping mirco-batches
-        # https://github.com/vllm-project/vllm/issues/18019
-        broadcast_pp_output = \
-            self.parallel_config.distributed_executor_backend \
-            == "external_launcher" and len(get_pp_group().ranks) > 0
-        if not get_pp_group().is_last_rank:
-            # For mid-pipeline stages, return the hidden states.
-            assert isinstance(hidden_states, IntermediateTensors)
-            if not broadcast_pp_output:
-                hidden_states.kv_connector_output = kv_connector_output
-                return hidden_states
-            get_pp_group().send_tensor_dict(hidden_states.tensors,
-                                            all_gather_group=get_tp_group())
-            logits = None
-        else:
-            if self.input_batch.pooling_params:
-                return self._pool(hidden_states, num_scheduled_tokens,
-                                  num_scheduled_tokens_np, kv_connector_output)
-
-            sample_hidden_states = hidden_states[logits_indices]
-            logits = self.model.compute_logits(sample_hidden_states, None)
-        if broadcast_pp_output:
-            model_output_broadcast_data = {
-                "logits": logits.contiguous(),
-            } if logits is not None else {}
-            model_output_broadcast_data = get_pp_group().broadcast_tensor_dict(
-                model_output_broadcast_data, src=len(get_pp_group().ranks) - 1)
-            assert model_output_broadcast_data is not None
-            logits = model_output_broadcast_data["logits"]
-
-        # Apply structured output bitmasks if present
-        if scheduler_output.grammar_bitmask is not None:
-            self.apply_grammar_bitmask(scheduler_output, logits)
+        if (self.model_config.is_encoder_decoder
+                and scheduler_output.scheduled_encoder_inputs):
+            encoder_inputs = self._extract_encoder_inputs(scheduler_output)
+            model_kwargs.update(encoder_inputs)
+
+        return (
+            num_scheduled_tokens,
+            num_input_tokens,
+            num_tokens_across_dp,
+            input_ids,
+            inputs_embeds,
+            positions,
+            intermediate_tensors,
+            model_kwargs,
+        )
 
+    def _sample(
+            self, logits: Optional[torch.Tensor],
+            spec_decode_metadata: Optional[SpecDecodeMetadata]
+    ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
         sampling_metadata = self.input_batch.sampling_metadata
         if spec_decode_metadata is None:
@@ -1660,7 +1861,23 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 sampling_metadata,
             )
             sampler_output.sampled_token_ids = output_token_ids
+            self._update_states_after_model_execute(output_token_ids)
 
+        return sampler_output
+
+    def _bookkeeping_sync(
+        self, scheduler_output: "SchedulerOutput",
+        sampler_output: SamplerOutput, logits: Optional[torch.Tensor],
+        hidden_states: torch.Tensor, num_scheduled_tokens: int
+    ) -> tuple[
+            dict[str, int],
+            Optional[LogprobsLists],
+            list[list[int]],
+            dict[str, Optional[LogprobsTensors]],
+            list[str],
+            dict[str, int],
+            list[int],
+    ]:
         num_nans_in_logits = {}
         if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
             num_nans_in_logits = self._get_nans_in_logits(logits)
@@ -1683,6 +1900,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # so that we could clear the sampled tokens before returning.
                 discard_sampled_tokens_req_indices.append(i)
 
+        # Copy some objects so they don't get modified after returning.
+        # This is important when using async scheduling.
+        req_ids_output_copy = self.input_batch.req_ids.copy()
+        req_id_to_index_output_copy = \
+            self.input_batch.req_id_to_index.copy()
+
         # NOTE: GPU -> CPU Sync happens here.
         # Move as many CPU operations as possible before this sync point.
         logprobs_tensors = sampler_output.logprobs_tensors
@@ -1695,21 +1918,42 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             scheduler_output.num_scheduled_tokens,
         )
 
-        # Get the valid generated tokens.
+        num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
         sampled_token_ids = sampler_output.sampled_token_ids
-        max_gen_len = sampled_token_ids.shape[-1]
-        if max_gen_len == 1:
-            # No spec decode tokens.
-            valid_sampled_token_ids = self._to_list(sampled_token_ids)
+        invalid_req_indices = []
+        if not self.use_async_scheduling:
+            # Get the valid generated tokens.
+            max_gen_len = sampled_token_ids.shape[-1]
+            if max_gen_len == 1:
+                # No spec decode tokens.
+                valid_sampled_token_ids = self._to_list(sampled_token_ids)
+            else:
+                # Includes spec decode tokens.
+                valid_sampled_token_ids = self.rejection_sampler.parse_output(
+                    sampled_token_ids,
+                    self.input_batch.vocab_size,
+                )
+            # Mask out the sampled tokens that should not be sampled.
+            for i in discard_sampled_tokens_req_indices:
+                valid_sampled_token_ids[i].clear()
         else:
-            # Includes spec decode tokens.
-            valid_sampled_token_ids = self.rejection_sampler.parse_output(
-                sampled_token_ids,
-                self.input_batch.vocab_size,
-            )
-        # Mask out the sampled tokens that should not be sampled.
-        for i in discard_sampled_tokens_req_indices:
-            valid_sampled_token_ids[i].clear()
+            valid_sampled_token_ids = []
+            invalid_req_indices = list(discard_sampled_tokens_req_indices)
+            invalid_req_indices_set = set(invalid_req_indices)
+            assert sampled_token_ids.shape[-1] == 1
+
+            # Cache the sampled tokens on the GPU and avoid CPU sync.
+            # These will be copied into input_ids in the next step
+            # when preparing inputs.
+            self.input_batch.prev_sampled_token_ids = \
+                sampled_token_ids
+            self.input_batch.prev_sampled_token_ids_invalid_indices = \
+                invalid_req_indices_set
+            self.input_batch.prev_req_id_to_index = {
+                req_id: i
+                for i, req_id in enumerate(self.input_batch.req_ids)
+                if i not in invalid_req_indices_set
+            }
 
         # Cache the sampled tokens in the model runner, so that the scheduler
         # doesn't need to send them back.
@@ -1717,7 +1961,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # the sampled tokens back, because there's no direct communication
         # between the first-stage worker and the last-stage worker.
         req_ids = self.input_batch.req_ids
-        for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
+        for req_idx in range(num_sampled_tokens):
+            if self.use_async_scheduling:
+                sampled_ids = [-1] if \
+                    req_idx not in invalid_req_indices_set else None
+            else:
+                sampled_ids = valid_sampled_token_ids[req_idx]
             if not sampled_ids:
                 continue
 
@@ -1732,28 +1981,175 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                            start_idx:end_idx] = sampled_ids
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
+
             req_id = req_ids[req_idx]
             req_state = self.requests[req_id]
             req_state.output_token_ids.extend(sampled_ids)
 
+        return (
+            num_nans_in_logits,
+            logprobs_lists,
+            valid_sampled_token_ids,
+            prompt_logprobs_dict,
+            req_ids_output_copy,
+            req_id_to_index_output_copy,
+            invalid_req_indices,
+        )
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+    ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
+        with record_function_or_nullcontext("Preprocess"):
+            self._update_states(scheduler_output)
+            if not scheduler_output.total_num_scheduled_tokens:
+                if not has_kv_transfer_group():
+                    # Return empty ModelRunnerOutput if there's no work to do.
+                    return EMPTY_MODEL_RUNNER_OUTPUT
+                return self.kv_connector_no_forward(scheduler_output,
+                                                    self.vllm_config)
+            if self.cache_config.kv_sharing_fast_prefill:
+                assert not self.input_batch.num_prompt_logprobs, (
+                    "--kv-sharing-fast-prefill produces incorrect logprobs for "
+                    "prompt tokens, tokens, please disable it when the requests"
+                    " need prompt logprobs")
+
+            if self.prepare_inputs_event is not None:
+                # Ensure prior step has finished with reused CPU tensors.
+                self.prepare_inputs_event.synchronize()
+            try:
+                # Prepare the decoder inputs.
+                (attn_metadata, logits_indices, spec_decode_metadata,
+                 num_scheduled_tokens_np, spec_decode_common_attn_metadata,
+                 max_query_len) = self._prepare_inputs(scheduler_output)
+
+            finally:
+                if self.prepare_inputs_event is not None:
+                    self.prepare_inputs_event.record()
+
+            (
+                num_scheduled_tokens,
+                num_input_tokens,
+                num_tokens_across_dp,
+                input_ids,
+                inputs_embeds,
+                positions,
+                intermediate_tensors,
+                model_kwargs,
+            ) = self._preprocess(scheduler_output, intermediate_tensors)
+
+            uniform_decode = (max_query_len
+                              == self.uniform_decode_query_len) and (
+                                  num_scheduled_tokens
+                                  == self.input_batch.num_reqs * max_query_len)
+            batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
+                                               uniform_decode=uniform_decode)
+            cudagraph_runtime_mode, batch_descriptor = \
+                self.cudagraph_dispatcher.dispatch(batch_descriptor)
+
+        # Run the model.
+        # Use persistent buffers for CUDA graphs.
+        with (set_forward_context(
+                attn_metadata,
+                self.vllm_config,
+                num_tokens=num_input_tokens,
+                num_tokens_across_dp=num_tokens_across_dp,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
+                batch_descriptor=batch_descriptor,
+        ), record_function_or_nullcontext("Forward"),
+              self.maybe_get_kv_connector_output(scheduler_output) as
+              kv_connector_output):
+            model_output = self.model(
+                input_ids=input_ids,
+                positions=positions,
+                intermediate_tensors=intermediate_tensors,
+                inputs_embeds=inputs_embeds,
+                **model_kwargs,
+            )
+
+        with record_function_or_nullcontext("Postprocess"):
+            if self.use_aux_hidden_state_outputs:
+                hidden_states, aux_hidden_states = model_output
+            else:
+                hidden_states = model_output
+                aux_hidden_states = None
+
+            # Broadcast PP output for external_launcher (torchrun)
+            # to make sure we are synced across pp ranks
+            # TODO: Support overlapping mirco-batches
+            # https://github.com/vllm-project/vllm/issues/18019
+            broadcast_pp_output = \
+                self.parallel_config.distributed_executor_backend \
+                == "external_launcher" and len(get_pp_group().ranks) > 0
+            if not get_pp_group().is_last_rank:
+                # For mid-pipeline stages, return the hidden states.
+                assert isinstance(hidden_states, IntermediateTensors)
+                if not broadcast_pp_output:
+                    hidden_states.kv_connector_output = kv_connector_output
+                    return hidden_states
+                get_pp_group().send_tensor_dict(
+                    hidden_states.tensors, all_gather_group=get_tp_group())
+                logits = None
+            else:
+                if self.is_pooling_model:
+                    return self._pool(hidden_states, num_scheduled_tokens,
+                                      num_scheduled_tokens_np,
+                                      kv_connector_output)
+
+                sample_hidden_states = hidden_states[logits_indices]
+                logits = self.model.compute_logits(sample_hidden_states, None)
+            if broadcast_pp_output:
+                model_output_broadcast_data = {
+                    "logits": logits.contiguous(),
+                } if logits is not None else {}
+                model_output_broadcast_data = get_pp_group(
+                ).broadcast_tensor_dict(model_output_broadcast_data,
+                                        src=len(get_pp_group().ranks) - 1)
+                assert model_output_broadcast_data is not None
+                logits = model_output_broadcast_data["logits"]
+
+            # Apply structured output bitmasks if present
+            if scheduler_output.grammar_bitmask is not None:
+                self.apply_grammar_bitmask(scheduler_output, logits)
+
+        with record_function_or_nullcontext("Sample"):
+            sampler_output = self._sample(logits, spec_decode_metadata)
+
+        with record_function_or_nullcontext("Bookkeep"):
+            (
+                num_nans_in_logits,
+                logprobs_lists,
+                valid_sampled_token_ids,
+                prompt_logprobs_dict,
+                req_ids_output_copy,
+                req_id_to_index_output_copy,
+                invalid_req_indices,
+            ) = self._bookkeeping_sync(scheduler_output, sampler_output,
+                                       logits, hidden_states,
+                                       num_scheduled_tokens)
+
         if self.speculative_config:
             assert spec_decode_common_attn_metadata is not None
-            self._draft_token_ids = self.propose_draft_token_ids(
-                scheduler_output,
-                valid_sampled_token_ids,
-                sampling_metadata,
-                hidden_states,
-                sample_hidden_states,
-                aux_hidden_states,
-                spec_decode_metadata,
-                spec_decode_common_attn_metadata,
-            )
+            with record_function_or_nullcontext("Draft"):
+                self._draft_token_ids = self.propose_draft_token_ids(
+                    scheduler_output,
+                    valid_sampled_token_ids,
+                    self.input_batch.sampling_metadata,
+                    hidden_states,
+                    sample_hidden_states,
+                    aux_hidden_states,
+                    spec_decode_metadata,
+                    spec_decode_common_attn_metadata,
+                )
 
-        self.eplb_step()
+        with record_function_or_nullcontext("EPLB"):
+            self.eplb_step()
 
-        return ModelRunnerOutput(
-            req_ids=self.input_batch.req_ids,
-            req_id_to_index=self.input_batch.req_id_to_index,
+        output = ModelRunnerOutput(
+            req_ids=req_ids_output_copy,
+            req_id_to_index=req_id_to_index_output_copy,
             sampled_token_ids=valid_sampled_token_ids,
             logprobs=logprobs_lists,
             prompt_logprobs_dict=prompt_logprobs_dict,
@@ -1762,6 +2158,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_nans_in_logits=num_nans_in_logits,
         )
 
+        if not self.use_async_scheduling:
+            return output
+
+        return AsyncGPUModelRunnerOutput(
+            model_runner_output=output,
+            sampled_token_ids=sampler_output.sampled_token_ids,
+            invalid_req_indices=invalid_req_indices,
+            async_output_copy_stream=self.async_output_copy_stream,
+        )
+
     def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
         if self._draft_token_ids is None:
             return None
@@ -2219,6 +2625,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         uniform_decode: bool = False,
         skip_eplb: bool = False,
         is_profile: bool = False,
+        create_mixed_batch: bool = False,
         remove_lora: bool = True,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -2237,6 +2644,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             uniform_decode: If True, the batch is a uniform decode batch.
             skip_eplb: If True, skip EPLB state update.
             is_profile: If True, this is a profile run.
+            create_mixed_batch: If True, create a mixed batch with both decode
+                (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
         assert cudagraph_runtime_mode in {
@@ -2268,13 +2677,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # has num_tokens in total.
         assert num_tokens <= self.scheduler_config.max_num_batched_tokens
         max_num_reqs = self.scheduler_config.max_num_seqs
-        if uniform_decode:
-            num_reqs = cdiv(num_tokens, max_query_len)
+        if create_mixed_batch:
+            assert not uniform_decode
+            # Create mixed batch:
+            # first half decode tokens, second half one prefill
+            num_decode_tokens = num_tokens // 2
+            num_prefill_tokens = num_tokens - num_decode_tokens
+            num_reqs = num_decode_tokens + 1
+
+            # Create decode requests (1 token each) followed by prefill request
+            num_scheduled_tokens_list = [1] * num_decode_tokens + [
+                num_prefill_tokens
+            ]
+            # Note: Overriding max_query_len to be the prefill tokens
+            max_query_len = num_prefill_tokens
+        elif uniform_decode:
+            num_reqs = num_tokens // max_query_len
             assert num_reqs <= max_num_reqs, \
                 "Do not capture num_reqs > max_num_reqs for uniform batch"
             num_scheduled_tokens_list = [max_query_len] * num_reqs
             if num_tokens % max_query_len != 0:
-                num_scheduled_tokens_list[-1] = num_tokens % max_query_len
+                num_scheduled_tokens_list[-1] += num_tokens % max_query_len
         else:
             num_reqs = min(num_tokens, max_num_reqs)
             min_tokens_per_req = num_tokens // num_reqs
@@ -2293,8 +2716,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
             attn_metadata = {}
 
-            # Make sure max_model_len is used at the graph capture time.
-            self.seq_lens.np[:num_reqs] = self.max_model_len
+            if create_mixed_batch:
+                # In the mixed batch mode (used for FI warmup), we use
+                # shorter sequence lengths to run faster.
+                # TODO(luka) better system for describing dummy batches
+                seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
+            else:
+                # Make sure max_model_len is used at the graph capture time.
+                seq_lens = self.max_model_len
+            self.seq_lens.np[:num_reqs] = seq_lens
             self.seq_lens.np[num_reqs:] = 0
             self.seq_lens.copy_to_gpu()
 
@@ -2326,17 +2756,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens, remove_lora):
-            if self.supports_mm_inputs:
+            model_kwargs = self._init_model_kwargs(num_tokens)
+            if (self.supports_mm_inputs
+                    and not self.model_config.is_encoder_decoder):
                 input_ids = None
-                inputs_embeds = self.inputs_embeds[:num_tokens]
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
                 model_kwargs = {
-                    **self._init_model_kwargs(num_tokens),
+                    **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
                 }
             else:
                 input_ids = self.input_ids.gpu[:num_tokens]
                 inputs_embeds = None
-                model_kwargs = self._init_model_kwargs(num_tokens)
 
             if self.uses_mrope:
                 positions = self.mrope_positions.gpu[:, :num_tokens]
@@ -2558,7 +2989,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 mm_budget = self.mm_budget
                 assert mm_budget is not None
 
-                # TODO: handle encoder-decoder models once we support them.
                 if (encoder_budget := mm_budget.get_encoder_budget()) > 0:
                     # NOTE: Currently model is profiled with a single non-text
                     # modality with the max possible input tokens even when
@@ -2611,12 +3041,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.encoder_cache.clear()
         gc.collect()
 
-    def capture_model(self) -> None:
+    def capture_model(self) -> int:
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
             logger.warning(
                 "Skipping CUDA graph capture. To turn on CUDA graph capture, "
                 "ensure `cudagraph_mode` was not manually set to `NONE`")
-            return
+            return 0
         else:
             self.initialize_cudagraph_capture()
 
@@ -2639,6 +3069,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             finally:
                 if should_freeze:
                     gc.unfreeze()
+                    gc.collect()
 
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
@@ -2675,7 +3106,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Disable cudagraph capturing globally, so any unexpected cudagraph
         # capturing will be detected and raise an error after here.
         # Note: We don't put it into graph_capture context manager because
-        # we may doing lazy capturing in future that still allows capturing
+        # we may do lazy capturing in future that still allows capturing
         # after here.
         set_cudagraph_capturing_enabled(False)
 
@@ -2686,6 +3117,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # This usually takes 5~20 seconds.
         logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
+        return cuda_graph_size
 
     def _capture_cudagraphs(self, compilation_cases: list[int],
                             cudagraph_runtime_mode: CUDAGraphMode,
@@ -2787,7 +3219,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.attn_groups.append(
                 create_attn_groups(attn_backends, kv_cache_spec))
 
-        # Calculate reorder batch threshold (if neeeded)
+        # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()
 
     def initialize_cudagraph_capture(self) -> None:
@@ -2904,7 +3336,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 "for more details.")
             self.input_batch = InputBatch(
                 max_num_reqs=self.max_num_reqs,
-                max_model_len=self.max_model_len,
+                max_model_len=max(self.max_model_len, self.max_encoder_len),
                 max_num_batched_tokens=self.max_num_tokens,
                 device=self.device,
                 pin_memory=self.pin_memory,
@@ -2913,6 +3345,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 is_spec_decode=bool(self.vllm_config.speculative_config),
                 logitsprocs=self.input_batch.logitsprocs,
                 is_pooling_model=self.is_pooling_model,
+                num_speculative_tokens=(
+                    self.vllm_config.speculative_config.num_speculative_tokens
+                    if self.vllm_config.speculative_config else 0),
             )
 
     def _allocate_kv_cache_tensors(
@@ -3150,6 +3585,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
+            if self.device.type == 'xpu':
+                get_kv_transfer_group().set_host_xfer_buffer_ops(
+                    copy_kv_blocks)
+
+        if self.dcp_world_size > 1:
+            layer_names = self.attn_groups[0][0].layer_names
+            layers = get_layers_from_vllm_config(self.vllm_config,
+                                                 AttentionLayerBase,
+                                                 layer_names)
+            for layer in layers.values():
+                assert layer.impl.need_to_return_lse_for_decode, (
+                    "DCP requires attention impls to return"
+                    " the softmax lse for decode, but the impl "
+                    f"{layer.impl.__class__.__name__} "
+                    "does not return the softmax lse for decode.")
 
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """
@@ -3162,7 +3612,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
         for layer_name, attn_module in attn_layers.items():
             if attn_module.attn_type == AttentionType.ENCODER_ONLY:
-                attn_spec = EncoderOnlyAttentionSpec(
+                attn_spec: AttentionSpec = EncoderOnlyAttentionSpec(
                     block_size=block_size,
                     num_kv_heads=attn_module.num_kv_heads,
                     head_size=attn_module.head_size,
@@ -3204,7 +3654,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.shared_kv_cache_layers[layer_name] = kv_tgt_layer
                 continue
 
-            # TODO: Support other attention modules, e.g., cross-attention
             # TODO(lucas): move the attention specs into the model layers like
             # the attention backends
             if attn_module.attn_type == AttentionType.DECODER:
@@ -3232,19 +3681,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         head_size=attn_module.head_size,
                         dtype=self.kv_cache_dtype,
                         use_mla=use_mla)
+            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
+                kv_cache_spec[layer_name] = CrossAttentionSpec(
+                    block_size=block_size,
+                    num_kv_heads=attn_module.num_kv_heads,
+                    head_size=attn_module.head_size,
+                    dtype=self.kv_cache_dtype,
+                    use_mla=use_mla)
             elif attn_module.attn_type in (AttentionType.ENCODER,
                                            AttentionType.ENCODER_ONLY):
                 # encoder-only attention does not need KV cache.
                 continue
-            elif attn_module.attn_type == AttentionType.ENCODER_DECODER:
-                raise NotImplementedError
             else:
                 raise ValueError(
                     f"Unknown attention type: {attn_module.attn_type}")
 
         mamba_layers = get_layers_from_vllm_config(self.vllm_config, MambaBase)
         if len(mamba_layers) > 0:
-            if self.vllm_config.speculative_config is not None:
+            if (self.vllm_config.speculative_config is not None
+                    and self.vllm_config.model_config.hf_config.model_type
+                    not in ["qwen3_next"]):
                 raise NotImplementedError(
                     "Mamba with speculative decoding is not supported yet.")
             if self.vllm_config.cache_config.enable_prefix_caching:
@@ -3263,7 +3719,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     dtypes=mamba_module.get_state_dtype(),
                     block_size=max_model_len,
                     page_size_padded=page_size_padded,
-                    mamba_type=mamba_module.mamba_type)
+                    mamba_type=mamba_module.mamba_type,
+                    num_speculative_blocks=(
+                        self.speculative_config.num_speculative_tokens
+                        if self.speculative_config else 0),
+                )
 
         return kv_cache_spec
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f49f5bdd9703bb971cc08fdd631583c59d5aadd5..37dd431fd68f8898e9f8c7b18b111defe7c48694 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -5,7 +5,7 @@ import copy
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import torch
 import torch.distributed
@@ -28,8 +28,8 @@ from vllm.tasks import SupportedTask
 from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
-                             ModelRunnerOutput)
+from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
+                             DraftTokenIds, ModelRunnerOutput)
 from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 from vllm.v1.worker.worker_base import WorkerBase
@@ -231,18 +231,40 @@ class Worker(WorkerBase):
             You may limit the usage of GPU memory
             by adjusting the `gpu_memory_utilization` parameter.
         """
+        GiB = lambda b: b / GiB_bytes
+        if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
+            # still need a profile run which compiles the model for
+            # max_num_batched_tokens
+            self.model_runner.profile_run()
+
+            msg = (
+                f"Initial free memory {GiB(self.init_snapshot.free_memory)} "
+                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for "
+                "KV Cache as specified by kv_cache_memory_bytes config and "
+                "skipped memory profiling. This does does not respect the "
+                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
+                "config when you want manual control of KV cache memory "
+                "size. If OOM'ed, check the difference of initial free "
+                "memory between the current run and the previous run "
+                "where kv_cache_memory_bytes is suggested and update it "
+                "correspondingly.")
+            logger.info(msg)
+            return kv_cache_memory_bytes
+
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
-        GiB = lambda b: b / GiB_bytes
 
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         with memory_profiling(
                 self.init_snapshot,
-                weights_memory=int(
-                    self.model_runner.model_memory_usage)) as profile_result:
+                weights_memory=int(self.model_runner.model_memory_usage),
+        ) as profile_result:
             self.model_runner.profile_run()
 
+        self.non_torch_memory = profile_result.non_torch_increase
+        self.peak_activation_memory = profile_result.torch_peak_increase
+
         free_gpu_memory = profile_result.after_profile.free_memory
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
@@ -254,7 +276,7 @@ class Worker(WorkerBase):
             "release GPU memory while vLLM is profiling during initialization. "
             "To fix this, ensure consistent GPU memory allocation or "
             "isolate vLLM in its own container.")
-        available_kv_cache_memory = self.requested_memory \
+        self.available_kv_cache_memory_bytes = self.requested_memory \
             - profile_result.non_kv_cache_memory
 
         unrequested_memory = self.init_snapshot.free_memory \
@@ -274,10 +296,10 @@ class Worker(WorkerBase):
         )
         logger.debug(profile_result)
         logger.info("Available KV cache memory: %.2f GiB",
-                    GiB(available_kv_cache_memory))
+                    GiB(self.available_kv_cache_memory_bytes))
         gc.collect()
 
-        return int(available_kv_cache_memory)
+        return int(self.available_kv_cache_memory_bytes)
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
@@ -317,8 +339,56 @@ class Worker(WorkerBase):
         # cuda graph capture.
         kernel_warmup(self)
 
+        cuda_graph_memory_bytes = 0
         if not self.model_config.enforce_eager:
-            self.model_runner.capture_model()
+            cuda_graph_memory_bytes = self.model_runner.capture_model()
+
+        if (self.cache_config.kv_cache_memory_bytes is None
+                and hasattr(self, "peak_activation_memory")):
+            # Suggests optimal kv cache memory size if we rely on
+            # memory_profiling to guess the kv cache memory size which
+            # provides peak_activation_memory and a few other memory
+            # consumption. `memory_profiling` does not consider
+            # CUDAGraph memory size and may not utilize all gpu memory.
+            # Users may want fine-grained control to specify kv cache
+            # memory size.
+            GiB = lambda b: round(b / GiB_bytes, 2)
+
+            # empirically observed that the memory profiling may
+            # slightly underestimate the memory consumption.
+            # So leave a small buffer (=150MiB) to avoid OOM.
+            redundancy_buffer_memory = 150 * (1 << 20)
+            non_kv_cache_memory = (self.model_runner.model_memory_usage +
+                                   self.peak_activation_memory +
+                                   self.non_torch_memory +
+                                   cuda_graph_memory_bytes)
+            kv_cache_memory_bytes_to_gpu_limit = (
+                self.init_snapshot.free_memory - non_kv_cache_memory -
+                redundancy_buffer_memory)
+            kv_cache_memory_bytes_to_requested_limit = (
+                int(self.requested_memory) - non_kv_cache_memory -
+                redundancy_buffer_memory)
+
+            msg = (
+                f"Free memory on device "
+                f"({GiB(self.init_snapshot.free_memory)}/"
+                f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
+                f"Desired GPU memory utilization is "
+                f"({self.cache_config.gpu_memory_utilization}, "
+                f"{GiB(self.requested_memory)} GiB). "
+                f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
+                f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
+                f"for peak activation, {GiB(self.non_torch_memory)} GiB "
+                f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
+                f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
+                f"config with `--kv-cache-memory="
+                f"{kv_cache_memory_bytes_to_requested_limit}` to fit into "
+                f"requested memory, or `--kv-cache-memory="
+                f"{kv_cache_memory_bytes_to_gpu_limit}` to fully "
+                f"utilize gpu memory. Current kv cache memory in use is "
+                f"{int(self.available_kv_cache_memory_bytes)} bytes.")
+
+            logger.info(msg)
 
         # Warm up sampler and preallocate memory buffer for logits and other
         # sampling related tensors of max possible shape to avoid memory
@@ -355,7 +425,7 @@ class Worker(WorkerBase):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> Optional[ModelRunnerOutput]:
+    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         if forward_pass and not get_pp_group().is_first_rank:
@@ -365,7 +435,7 @@ class Worker(WorkerBase):
 
         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
-        if isinstance(output, ModelRunnerOutput):
+        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
             return output
 
         assert isinstance(output, IntermediateTensors)
@@ -400,8 +470,10 @@ class Worker(WorkerBase):
             self.profiler.start()
         else:
             self.profiler.stop()
-            print(self.profiler.key_averages().table(
-                sort_by="self_cuda_time_total"))
+            # only print profiler results on rank 0
+            if self.local_rank == 0:
+                print(self.profiler.key_averages().table(
+                    sort_by="self_cuda_time_total"))
 
     def execute_dummy_batch(self) -> None:
         self.model_runner._dummy_run(1)
@@ -498,7 +570,8 @@ class Worker(WorkerBase):
         parallel_config = self.vllm_config.parallel_config
         moe_modules = [
             module for module in self.model_runner.model.modules()
-            if module.__class__.__name__ == "FusedMoE"
+            if (module.__class__.__name__ == "FusedMoE"
+                or module.__class__.__name__ == "SharedFusedMoE")
         ]
         num_local_experts = moe_modules[0].moe_config.num_local_experts
         assert all(module.moe_config.num_local_experts == num_local_experts
@@ -598,6 +671,9 @@ class Worker(WorkerBase):
         self.model_runner.save_tensorized_model(
             tensorizer_config=tensorizer_config, )
 
+    def shutdown(self) -> None:
+        self.model_runner.ensure_kv_transfer_shutdown()
+
 
 def init_worker_distributed_environment(
     vllm_config: VllmConfig,
@@ -613,7 +689,9 @@ def init_worker_distributed_environment(
     init_distributed_environment(parallel_config.world_size, rank,
                                  distributed_init_method, local_rank, backend)
 
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
+    ensure_model_parallel_initialized(
+        parallel_config.tensor_parallel_size,
+        parallel_config.pipeline_parallel_size,
+        parallel_config.decode_context_parallel_size)
 
     ensure_kv_transfer_initialized(vllm_config)
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index a03ebe35d8e0ac2051018fc6b740cba01bae94b4..3eb9f26e9f5b6aeb31e3132c78e31e02b2ad910c 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -9,7 +9,8 @@ from typing import Generator  # noqa: UP035
 from typing import TYPE_CHECKING, Optional
 
 from vllm.config import VllmConfig
-from vllm.distributed.kv_transfer import (get_kv_transfer_group,
+from vllm.distributed.kv_transfer import (ensure_kv_transfer_shutdown,
+                                          get_kv_transfer_group,
                                           has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
 from vllm.forward_context import get_forward_context, set_forward_context
@@ -42,6 +43,12 @@ class KVConnectorModelRunnerMixin:
             # Do this here to save a collective_rpc.
             kv_connector.start_load_kv(get_forward_context())
 
+    @staticmethod
+    def ensure_kv_transfer_shutdown() -> None:
+        # has_kv_transfer_group can be None during interpreter shutdown.
+        if has_kv_transfer_group and has_kv_transfer_group():
+            ensure_kv_transfer_shutdown()
+
     @staticmethod
     def maybe_wait_for_kv_save() -> None:
         if has_kv_transfer_group():
@@ -82,7 +89,7 @@ class KVConnectorModelRunnerMixin:
             scheduler_output) if has_kv_transfer_group() else nullcontext()
 
     # This context manager must be used within an active forward context.
-    # It encapsulates the entire KV conector lifecycle within execute_model
+    # It encapsulates the entire KV connector lifecycle within execute_model
     @staticmethod
     @contextmanager
     def _get_kv_connector_output(
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 4b5f27d27541b9cca09cbb1a8b9bc94cdc21bb8b..f2ebd5e10210bf05a14146e10e5f21d7512aefd0 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -11,7 +11,8 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 985d5ba58c49c6a8bdf3d88b62c17d4235ec324b..15af7ffac8095a9ecee9542dcc9e66df93c1330c 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -3,7 +3,7 @@
 import bisect
 import gc
 import time
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, cast
 from unittest.mock import patch
 
 import numpy as np
@@ -23,6 +23,7 @@ from vllm.config import (ParallelConfig, VllmConfig,
                          get_layers_from_vllm_config, update_config)
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
+from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA
@@ -1768,28 +1769,22 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.grammar_bitmask_cpu.zero_()
         self.require_structured_out_cpu.zero_()
 
-        # We receive the structured output bitmask from the scheduler, but the
-        # indices of the requests in the batch may not match the indices of
-        # the bitmask since the scheduler doesn't know how the tpu runner is
-        # ordering the requests in the batch. We need to match the order of
-        # bitmask with the order of requests
-        struct_out_indices: list[int] = []
-        mask_indices: list[int] = []
-        for req_id in self.input_batch.req_ids:
-            mask_index = scheduler_output.structured_output_request_ids.get(
-                req_id)
-            if mask_index is None:
+        sorted_struct_requests = sorted(
+            scheduler_output.structured_output_request_ids.items(),
+            key=lambda item: item[1])
+        cumulative_mask_idx = 0
+        for req_id, _ in sorted_struct_requests:
+            if req_id not in self.input_batch.req_id_to_index:
                 continue
             batch_index = self.input_batch.req_id_to_index[req_id]
-            struct_out_indices.append(batch_index)
-            mask_indices.append(mask_index)
-        self.grammar_bitmask_cpu[struct_out_indices] = torch.from_numpy(
-            grammar_bitmask[mask_indices])
-        # It's not guaranteed that all requests in this batch require
-        # structured output, so create a bool tensor to represent
-        # the requests that need structured output.
-        struct_out_indices = torch.tensor(struct_out_indices, dtype=torch.long)
-        self.require_structured_out_cpu[struct_out_indices] = True
+            self.grammar_bitmask_cpu[batch_index] = torch.from_numpy(
+                grammar_bitmask[cumulative_mask_idx])
+            # It's not guaranteed that all requests in this batch require
+            # structured output, so create a bool tensor to represent
+            # the requests that need structured output.
+            self.require_structured_out_cpu[batch_index] = True
+            cumulative_mask_idx += 1
+
         return self.require_structured_out_cpu[:num_reqs].to(logits.device), \
             self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \
             self.structured_decode_arange.to(logits.device)
@@ -1887,75 +1882,6 @@ def _get_padded_token_len(paddings: list[int], x: int) -> int:
     return paddings[index]
 
 
-def _make_src_and_dst_indices(
-    src_block_ids: list[int],
-    dst_block_ids: list[int],
-    src_device: Union[torch.device, str],
-    dst_device: Union[torch.device, str],
-) -> tuple[torch.Tensor, torch.Tensor]:
-    src_indices = torch.tensor(src_block_ids,
-                               device=src_device,
-                               dtype=torch.int64)
-    dst_indices = torch.tensor(dst_block_ids,
-                               device=dst_device,
-                               dtype=torch.int64)
-    return src_indices, dst_indices
-
-
-@torch.compile(backend="openxla")
-def _insert_blocks_to_tpu(
-    cpu_cache: torch.Tensor,
-    tpu_cache: torch.Tensor,
-    cpu_block_indices: torch.Tensor,
-    tpu_block_indices: torch.Tensor,
-) -> None:
-    torch.ops.xla.dynamo_set_buffer_donor_(tpu_cache, True)
-    tpu_cache[tpu_block_indices] = cpu_cache[cpu_block_indices].to(
-        tpu_cache.device)
-
-
-@torch.compile(backend="openxla")
-def _swap_out_tpu_blocks(
-    tpu_cache: torch.Tensor,
-    cpu_cache: torch.Tensor,
-    tpu_block_indices: torch.Tensor,
-    cpu_block_indices: torch.Tensor,
-) -> None:
-    """ tpu blocks to cpu blocks"""
-    torch.ops.xla.dynamo_set_buffer_donor_(tpu_cache, True)
-    cpu_cache[cpu_block_indices] = tpu_cache[tpu_block_indices].cpu()
-
-
-def copy_kv_blocks(
-    src_kv_caches: dict[str, torch.Tensor],
-    dst_kv_caches: dict[str, torch.Tensor],
-    src_block_ids: list[int],
-    dst_block_ids: list[int],
-    direction: Literal["h2d", "d2h"],
-) -> None:
-    """Copy kv blocks between different buffers."""
-    if not src_kv_caches or not dst_kv_caches or \
-       not src_block_ids or not dst_block_ids or \
-       len(src_block_ids) != len(dst_block_ids):
-        return
-
-    src_device = next(iter(src_kv_caches.values())).device
-    dst_device = next(iter(dst_kv_caches.values())).device
-
-    src_indices, dst_indices = _make_src_and_dst_indices(
-        src_block_ids=src_block_ids,
-        dst_block_ids=dst_block_ids,
-        src_device=src_device,
-        dst_device=dst_device)
-
-    _copy_fn = _insert_blocks_to_tpu if direction == "h2d" else \
-               _swap_out_tpu_blocks
-    for layer_name in src_kv_caches:
-        src_tensor = src_kv_caches[layer_name]
-        dst_tensor = dst_kv_caches[layer_name]
-        _copy_fn(src_tensor, dst_tensor, src_indices, dst_indices)
-
-
 def _get_padded_num_kv_cache_update_slices(num_tokens: int, max_num_reqs: int,
                                            page_size: int) -> int:
     """Calculates the padded number of KV cache update slices to avoid
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 9adf8a14213f353d178edac6764259d74b8a2081..fc72b954df9cf7ca6476749a19c3ec1e1cfaf3fc 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -250,7 +250,7 @@ class TPUWorker:
         scheduler_output: "SchedulerOutput",
     ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
-        # every worker's output is needed when kv_transfer_group is setup
+        # every worker's output is needed when kv_transfer_group is set up
         return output if self.is_driver_worker or has_kv_transfer_group(
         ) else None
 
@@ -330,6 +330,9 @@ class TPUWorker:
 
         ensure_kv_transfer_initialized(vllm_config)
 
+    def shutdown(self) -> None:
+        self.model_runner.ensure_kv_transfer_shutdown()
+
 
 if USE_TPU_COMMONS:
     from tpu_commons.worker import TPUWorker as TPUCommonsWorker
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 6767804c71b9f422b8015a5096bfd49e150763da..be05d02ff29fe3777b9f9d96dc136eb1cecfcad5 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -12,6 +12,7 @@ from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
 from vllm.multimodal.registry import MultiModalRegistry
+from vllm.platforms import current_platform
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec
@@ -269,7 +270,17 @@ def bind_kv_cache(
             # One typical case is encoder-decoder model, e.g., bart.
             # The cross attention and self attention in the same decoder layer
             # has different layer_name but the same layer_index.
-            raise NotImplementedError
+
+            # TODO - analyze where runner_kv_caches is used and the right
+            # way to ensure it properly reflects multiple attention layers
+            # in the same decoder block.
+            if current_platform.is_cuda():
+                # We know that the GPU runner is not impacted by this
+                # case. Some test code depends on runner_kv_caches, but
+                # not in a way that's impacted by ignoring this.
+                pass
+            else:
+                raise NotImplementedError
         layer_name = layer_names[0]
         runner_kv_caches.append(kv_caches[layer_name])
 
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index 17288cda8eccff905426123c3850467aef631042..7355206f30f57419a9423bed77852149a6ac527c 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -84,7 +84,7 @@ class XPUWorker(Worker):
         """Profiles the peak memory usage of the model to determine how many
         KV blocks may be allocated without OOMs.
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
         .. tip::
             You may limit the usage of GPU memory
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index f05401fd0132761e0f9832cd974d68c8778ff08a..88f83c9dd7e6c13d9cbb94cc27af24ac021d5ba7 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1337,8 +1337,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         return self.lora_manager.list_adapters()
 
     @torch.inference_mode()
-    def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
-        """Cuda graph capture a model.
+    def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> int:
+        """Cuda graph capture a model and return cudagraph memory
+        consumption in bytes.
 
         Note that CUDA graph's performance gain is negligible if number
         of batched tokens are larger than 200. And since CUDA graph
@@ -1505,6 +1506,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         # This usually takes < 10 seconds.
         logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / GiB_bytes)
+        return cuda_graph_size
 
     def _update_inputs_to_capture_for_enc_dec_model(self,
                                                     capture_inputs: Dict[str,
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
deleted file mode 100644
index 8317b9abff0cd9cc55eadcaeac4c0d74f0c07466..0000000000000000000000000000000000000000
--- a/vllm/worker/neuron_model_runner.py
+++ /dev/null
@@ -1,455 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
-
-import torch
-from torch import nn
-
-from vllm.config import DeviceConfig, VllmConfig
-from vllm.logger import init_logger
-from vllm.lora.layers import LoRAMapping
-from vllm.lora.request import LoRARequest
-from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.model_executor.model_loader.neuron import get_neuron_model
-from vllm.multimodal import BatchedTensorInputs, MultiModalKwargs
-from vllm.platforms import current_platform
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.utils import is_pin_memory_available, make_tensor_with_pad
-from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
-
-if TYPE_CHECKING:
-    from vllm.attention.backends.abstract import AttentionBackend
-
-logger = init_logger(__name__)
-
-
-@dataclass(frozen=True)
-class ModelInputForNeuron(ModelRunnerInputBase):
-    """
-    Used by the NeuronModelRunner.
-    """
-    input_tokens: Optional[torch.Tensor] = None
-    input_positions: Optional[torch.Tensor] = None
-    input_block_ids: Optional[torch.Tensor] = None
-    sampling_metadata: SamplingMetadata = None
-    multi_modal_kwargs: BatchedTensorInputs = None
-    adapter_ids: Optional[str] = None
-
-    def as_broadcastable_tensor_dict(
-            self) -> Dict[str, Union[int, torch.Tensor]]:
-        return {
-            "input_tokens": self.input_tokens,
-            "input_positions": self.input_positions,
-            "input_block_ids": self.input_block_ids,
-            "sampling_metadata": self.sampling_metadata,
-            "multi_modal_kwargs": self.multi_modal_kwargs,
-        }
-
-    @classmethod
-    def from_broadcasted_tensor_dict(
-        cls,
-        tensor_dict: Dict[str, Any],
-        attn_backend: Optional["AttentionBackend"] = None,
-    ) -> "ModelInputForNeuron":
-        return ModelInputForNeuron(
-            input_tokens=tensor_dict["input_tokens"],
-            input_positions=tensor_dict["input_positions"],
-            input_block_ids=tensor_dict["input_block_ids"],
-            sampling_metadata=tensor_dict["sampling_metadata"],
-            multi_modal_kwargs=tensor_dict["multi_modal_kwargs"],
-        )
-
-
-class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
-    """A model runner for AWS Neuron hardware"""
-
-    # NEURON has an upper limit on the top_k
-    _MAX_NEURON_SAMPLING_TOP_K = 256
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ):
-        ModelRunnerBase.__init__(self, vllm_config)
-
-        if (self.model_config is not None
-                and self.model_config.get_sliding_window()):
-            logger.warning("Sliding window is not supported on Neuron. "
-                           "The model will run without sliding window.")
-        self.device_config = (self.device_config if self.device_config
-                              is not None else DeviceConfig())
-        self.lora_config = vllm_config.lora_config
-        self.device = self.device_config.device
-        self.pin_memory = is_pin_memory_available()
-
-        # Lazy initialization.
-        self.model: nn.Module  # initialize after load_model.
-
-        # Once NEURON_ON_DEVICE_SAMPLING_DISABLED is set to a non-zero value,
-        # turn off on-device sampling.
-        self._on_device_sampling_disabled = int(
-            os.getenv("NEURON_ON_DEVICE_SAMPLING_DISABLED", "0"))
-
-        # NEURON needs to update sampling parameters when request IDs change
-        # across batches. This variable stores the previous batch's request IDs
-        # to determine if an update is needed.
-        self._previous_batch_request_ids: List[str] = []
-
-        if not self._on_device_sampling_disabled:
-            self._init_neuron_sampling()
-
-    def _init_neuron_sampling(self) -> None:
-        if current_platform.use_transformers_neuronx():
-            from transformers_neuronx.config import GenerationConfig
-        else:
-            from transformers import GenerationConfig
-        logger.warning(
-            "On-device sampling is turned on in Neuron by default, only "
-            "top_k, top_p, and temperature are current supported sampling "
-            "parameters. To turn off the on-device sampling, please set "
-            "the environment variable NEURON_ON_DEVICE_SAMPLING_DISABLED=1.")
-        self.model_config.neuron_sampling_params = GenerationConfig(
-            max_length=self.scheduler_config.max_model_len,
-            do_sample=True,
-            per_batch_line=True,
-            top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \
-                  * self.scheduler_config.max_num_seqs,
-            top_p=[1.0] * self.scheduler_config.max_num_seqs,
-            temperature=[1.0] * self.scheduler_config.max_num_seqs,
-            dynamic=True,
-            global_top_k=self._MAX_NEURON_SAMPLING_TOP_K)
-
-    def load_model(self) -> None:
-        self.model = get_neuron_model(self.model_config,
-                                      parallel_config=self.parallel_config,
-                                      scheduler_config=self.scheduler_config)
-
-    def get_model(self) -> nn.Module:
-        return self.model
-
-    def _prepare_prompt(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int],
-               BatchedTensorInputs]:
-        assert len(seq_group_metadata_list) > 0
-        input_tokens: List[List[int]] = []
-        input_positions: List[List[int]] = []
-        input_block_ids: List[int] = []
-
-        seq_lens: List[int] = []
-        multi_modal_kwargs_list: List[MultiModalKwargs] = []
-        for seq_group_metadata in seq_group_metadata_list:
-            assert seq_group_metadata.is_prompt
-            seq_ids = list(seq_group_metadata.seq_data.keys())
-            assert len(seq_ids) == 1
-            seq_id = seq_ids[0]
-
-            seq_data = seq_group_metadata.seq_data[seq_id]
-            prompt_tokens = seq_data.get_token_ids()
-            seq_len = len(prompt_tokens)
-            seq_lens.append(seq_len)
-
-            input_tokens.append(prompt_tokens)
-            input_positions.append(list(range(seq_len)))
-
-            assert seq_group_metadata.block_tables is not None
-            block_table = seq_group_metadata.block_tables[seq_id]
-            assert len(block_table) == 1
-            input_block_ids.append(block_table[0])
-
-            mm_kwargs = seq_group_metadata.multi_modal_data
-            if mm_kwargs:
-                mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs)
-                multi_modal_kwargs_list.append(mm_kwargs)
-
-        max_seq_len = max(seq_lens)
-        assert max_seq_len > 0
-        input_tokens = make_tensor_with_pad(input_tokens,
-                                            pad=0,
-                                            max_len=max_seq_len,
-                                            dtype=torch.long,
-                                            device=self.device)
-        input_positions = make_tensor_with_pad(input_positions,
-                                               pad=0,
-                                               max_len=max_seq_len,
-                                               dtype=torch.long,
-                                               device=self.device)
-        input_block_ids = torch.tensor(input_block_ids,
-                                       dtype=torch.long,
-                                       device=self.device)
-
-        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
-
-        return (input_tokens, input_positions, input_block_ids, seq_lens,
-                multi_modal_kwargs)
-
-    def _prepare_decode(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        assert len(seq_group_metadata_list) > 0
-        input_tokens: List[List[int]] = []
-        input_positions: List[List[int]] = []
-        input_block_ids: List[int] = []
-        context_lens: List[int] = []
-
-        for seq_group_metadata in seq_group_metadata_list:
-            assert not seq_group_metadata.is_prompt
-
-            seq_ids = list(seq_group_metadata.seq_data.keys())
-
-            for seq_id in seq_ids:
-                seq_data = seq_group_metadata.seq_data[seq_id]
-                generation_token = seq_data.get_last_token_id()
-                input_tokens.append([generation_token])
-
-                seq_len = seq_data.get_len()
-                position = seq_len - 1
-                input_positions.append([position])
-                context_lens.append(seq_len)
-
-                assert seq_group_metadata.block_tables is not None
-                block_table = seq_group_metadata.block_tables[seq_id]
-                assert len(block_table) == 1
-                input_block_ids.append(block_table[0])
-
-        input_tokens = make_tensor_with_pad(input_tokens,
-                                            pad=0,
-                                            max_len=1,
-                                            dtype=torch.long,
-                                            device=self.device)
-        input_positions = make_tensor_with_pad(input_positions,
-                                               pad=0,
-                                               max_len=1,
-                                               dtype=torch.long,
-                                               device=self.device)
-        context_lens = torch.tensor(context_lens,
-                                    dtype=torch.int,
-                                    device=self.device)
-        input_block_ids = torch.tensor(input_block_ids,
-                                       dtype=torch.long,
-                                       device=self.device)
-
-        return input_tokens, input_positions, input_block_ids
-
-    def make_model_input_from_broadcasted_tensor_dict(
-            self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron:
-        return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict)
-
-    def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        virtual_engine: int = 0,
-        finished_requests_ids: Optional[List[str]] = None
-    ) -> ModelInputForNeuron:
-        multi_modal_kwargs = None
-        # NOTE: We assume that all sequences in the group are all prompts or
-        # all decodes.
-        is_prompt = seq_group_metadata_list[0].is_prompt
-        # Prepare input tensors.
-        if is_prompt:
-            (input_tokens, input_positions, input_block_ids, seq_lens,
-             multi_modal_kwargs
-             ) = self._prepare_prompt(seq_group_metadata_list)
-        else:
-            (input_tokens, input_positions,
-             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
-            seq_lens = None
-
-        if not self._on_device_sampling_disabled:
-            for seq_group_metadata in seq_group_metadata_list:
-                sampling_params = seq_group_metadata.sampling_params
-                top_k, top_p, temperature = (
-                    self._convert_to_neuron_sampling_params(sampling_params))
-                sampling_params.top_k = top_k
-                sampling_params.top_p = top_p
-                sampling_params.temperature = temperature
-
-        # we need multi_modal_data for later tokens as well
-        multi_modal_kwargs_list: List[MultiModalKwargs] = []
-        for seq_group_metadata in seq_group_metadata_list:
-            mm_data = seq_group_metadata.multi_modal_data
-            if mm_data:
-                multi_modal_kwargs_list.append(mm_data)
-        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
-
-        sampling_metadata = SamplingMetadata.prepare(
-            seq_group_metadata_list,
-            seq_lens,
-            # query_lens is not needed if chunked prefill is not
-            # supported. Since neuron worker doesn't support chunked prefill
-            # just use seq_lens instead.
-            seq_lens,
-            self.device,
-            self.pin_memory,
-            generators=self.get_generators(finished_requests_ids))
-
-        if current_platform.use_transformers_neuronx(
-        ) and not self._on_device_sampling_disabled:
-            # Once the request IDs are changed in current iteration, we will
-            # update the on-device sampling parameters.
-            current_batch_request_ids = [
-                seq_group_meta_data.request_id
-                for seq_group_meta_data in seq_group_metadata_list
-            ]
-            if current_batch_request_ids != self._previous_batch_request_ids:
-                self._update_neuron_sampling_params(seq_group_metadata_list)
-                self._previous_batch_request_ids = current_batch_request_ids
-
-        return ModelInputForNeuron(input_tokens=input_tokens,
-                                   input_positions=input_positions,
-                                   input_block_ids=input_block_ids,
-                                   sampling_metadata=sampling_metadata,
-                                   multi_modal_kwargs=multi_modal_kwargs)
-
-    def _update_neuron_sampling_params(
-            self, seq_group_metadata_list: List[SequenceGroupMetadata]):
-        # Update Neuron sampling parameters (GenerationConfig in Neuron)
-        current_sampling_params = self.model_config.neuron_sampling_params
-        assert current_sampling_params is not None, (
-            f"Failed to update sampling_params, "
-            f"current sampling params is {current_sampling_params}")
-
-        is_update_needed = False
-
-        top_k = current_sampling_params.top_k
-        top_p = current_sampling_params.top_p
-        temperature = current_sampling_params.temperature
-
-        # The index of a sequence's sampling parameters in neuron is equal to
-        # its index in `input_block_ids`.
-        for seq_group_metadata in seq_group_metadata_list:
-            seq_ids = list(seq_group_metadata.seq_data.keys())
-            sampling_params = seq_group_metadata.sampling_params
-
-            seq_group_top_k = sampling_params.top_k
-            seq_group_top_p = sampling_params.top_p
-            seq_group_temperature = sampling_params.temperature
-
-            for seq_id in seq_ids:
-                index = seq_group_metadata.block_tables[seq_id][0]
-                if (top_k[index] != seq_group_top_k
-                        or top_p[index] != seq_group_top_p
-                        or temperature[index] != seq_group_temperature):
-                    is_update_needed = True
-
-                top_k[index] = seq_group_top_k
-                top_p[index] = seq_group_top_p
-                temperature[index] = seq_group_temperature
-
-        # update_generation_config is only available in transformers-neuronx
-        if is_update_needed and current_platform.use_transformers_neuronx():
-            self.model.model.update_generation_config(current_sampling_params)
-
-    def _convert_to_neuron_sampling_params(
-            self, sampling_params: SamplingParams) -> Tuple[int, float, float]:
-        # Returns the top_k, top_p and temperature parameters for neuron.
-        top_k = sampling_params.top_k
-        top_p = sampling_params.top_p
-        temperature = sampling_params.temperature
-
-        if temperature == 0.0:
-            # Enable greedy sampling on zero temperature
-            return (1, 1.0, 1.0)
-        if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K:
-            top_k = self._MAX_NEURON_SAMPLING_TOP_K
-
-        return (top_k, top_p, temperature)
-
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        model_input: ModelInputForNeuron,
-        kv_caches: Optional[List[torch.Tensor]] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        num_steps: int = 1,
-    ) -> Optional[List[SamplerOutput]]:
-        if num_steps > 1:
-            raise ValueError(
-                "NeuronModelRunner does not support multi-step execution.")
-
-        # extract top_k, top_p and temperature from model_input for neuron
-        # forward call
-        sampling_params = (torch.tensor([[
-            seq_group.sampling_params.top_k, seq_group.sampling_params.top_p,
-            seq_group.sampling_params.temperature
-        ] for seq_group in model_input.sampling_metadata.seq_groups]))
-
-        if current_platform.use_neuronx_distributed():
-            hidden_states = self.model(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                input_block_ids=model_input.input_block_ids,
-                sampling_params=sampling_params,
-                adapter_ids=model_input.adapter_ids,
-                **MultiModalKwargs.as_kwargs(
-                    model_input.multi_modal_kwargs or {},
-                    device=self.device,
-                ),
-            )
-        elif current_platform.use_transformers_neuronx():
-            # [TODO] validate on-device sampling
-            # The model signature may need change for on-device sampling
-            hidden_states = self.model(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                input_block_ids=model_input.input_block_ids,
-                **MultiModalKwargs.as_kwargs(
-                    model_input.multi_modal_kwargs or {},
-                    device=self.device,
-                ),
-            )
-
-        # Compute the logits only if the on-device sampling is turned off as
-        # on-device sampling outputs the token ids.
-        if self._on_device_sampling_disabled:
-            logits = self.model.compute_logits(hidden_states,
-                                               model_input.sampling_metadata)
-        else:
-            logits = hidden_states
-
-        # Sample the next token.
-        output = self.model.sample(
-            logits=logits,
-            sampling_metadata=model_input.sampling_metadata,
-        )
-        return [output]
-
-    @property
-    def vocab_size(self) -> int:
-        return self.model_config.get_vocab_size()
-
-    def process_multi_modal_data_neuron(self, mm_data):
-        # this is a no-op for NeuronModelRunner
-        return mm_data
-
-    def remove_all_loras(self):
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
-
-    def set_active_loras(self, lora_requests: Set[LoRARequest],
-                         lora_mapping: LoRAMapping) -> None:
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
-
-    def add_lora(self, lora_request: LoRARequest):
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError(
-            "LoRAs are not supported for Transformers NeuronX framework")
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
deleted file mode 100644
index 3e4512a639083eb0acb78c0baf5472c856ca3073..0000000000000000000000000000000000000000
--- a/vllm/worker/neuron_worker.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""A Neuron worker class."""
-import os
-from typing import List, Optional, Set, Tuple
-
-import torch.distributed
-
-from vllm.config import VllmConfig
-from vllm.distributed import (ensure_model_parallel_initialized,
-                              init_distributed_environment)
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor import set_random_seed
-from vllm.platforms import current_platform
-from vllm.platforms.neuron import NeuronFramework
-from vllm.sequence import ExecuteModelRequest
-from vllm.worker.neuron_model_runner import NeuronModelRunner
-from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
-                                     WorkerInput)
-
-logger = init_logger(__name__)
-
-
-class NeuronWorker(LocalOrDistributedWorkerBase):
-    """A worker class that executes the model on a group of neuron cores.
-    """
-
-    model_runner: NeuronModelRunner
-
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 local_rank: int,
-                 rank: int,
-                 distributed_init_method: str,
-                 is_driver_worker: bool = False) -> None:
-        WorkerBase.__init__(self, vllm_config=vllm_config)
-        self.local_rank = local_rank
-        self.rank = rank
-        self.distributed_init_method = distributed_init_method
-        self.is_driver_worker = is_driver_worker
-        self.lora_config = vllm_config.lora_config
-
-        if self.model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
-            init_cached_hf_modules()
-
-        neuron_framework = current_platform.get_neuron_framework_to_use()
-        if neuron_framework == NeuronFramework.TRANSFORMERS_NEURONX:
-            self.model_runner = self.get_tnx_model_runner(vllm_config)
-        elif neuron_framework == NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE:
-            self.model_runner = self.get_neuronx_distributed_model_runner(
-                vllm_config)
-        else:
-            raise NotImplementedError(
-                "Specified framework" +
-                f" {os.environ.get('VLLM_NEURON_FRAMEWORK')}" +
-                " is either not installed or not supported." +
-                " Supported frameworks: " +
-                "[transformers-neuronx, neuronx-distributed-inference]")
-
-    def get_tnx_model_runner(self, vllm_config):
-        assert (self.lora_config
-                is None), ("LoRA is not supported for TransformersNeuronX "
-                           "framework.")
-        if self.speculative_config is not None:
-            raise NotImplementedError(
-                "Speculative decoding is not supported for TransformersNeuronX"
-            )
-        return NeuronModelRunner(vllm_config=vllm_config)
-
-    def get_neuronx_distributed_model_runner(self, vllm_config):
-        from vllm.worker.neuronx_distributed_model_runner import (
-            NeuronxDistributedModelRunner)
-        if self.speculative_config is not None:
-            assert (self.lora_config is None), (
-                "LoRA is not supported for Speculative Decoding")
-            raise NotImplementedError(
-                "Speculative decoding is not supported for NeuronxDistributed")
-        return NeuronxDistributedModelRunner(vllm_config=vllm_config)
-
-    def init_device(self) -> None:
-        self.init_distributed_environment()
-
-        # Set random seed.
-        set_random_seed(self.model_config.seed)
-
-    def load_model(self):
-        self.model_runner.load_model()
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available KV blocks.
-
-        Swapping is not yet supported, so always return num_cpu_blocks=0.
-
-        We configure num_gpu_blocks to be equal to max_num_seqs.
-        """
-        # Set the number of GPU blocks to be the same as the maximum number of
-        # sequences that can be processed in a single batch. This is equivalent
-        # to schedule without PagedAttention.
-        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
-
-        # Swap not yet supported with Neuron backend.
-        num_cpu_blocks = 0
-
-        return num_gpu_blocks, num_cpu_blocks
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache.
-        """
-
-        # Different values are not tested.
-        assert num_cpu_blocks == 0
-        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
-
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-    @property
-    def do_metadata_broadcast(self) -> bool:
-        return False
-
-    @property
-    def kv_cache(self) -> Optional[List[List[torch.Tensor]]]:
-        return None
-
-    @torch.inference_mode()
-    def prepare_worker_input(
-            self, execute_model_req: ExecuteModelRequest) -> WorkerInput:
-        return WorkerInput(num_seq_groups=len(
-            execute_model_req.seq_group_metadata_list), )
-
-    def execute_worker(self, worker_input: WorkerInput) -> None:
-        pass
-
-    def get_cache_block_size_bytes(self) -> int:
-        """Determine the size in bytes of a cache block.
-
-        This is required for speculative decoding; it is not yet implemented.
-        """
-        raise NotImplementedError
-
-    def init_distributed_environment(self):
-        """Neuron uses transformers-neuronx for tensor parallelism.
-
-        vLLM still needs the environment initialized when TP/PP > 1
-        """
-        init_distributed_environment(
-            world_size=1,
-            rank=self.rank,
-            local_rank=self.local_rank,
-            distributed_init_method=self.distributed_init_method,
-            backend=current_platform.dist_backend,
-        )
-
-        ensure_model_parallel_initialized(
-            1,
-            1,
-        )
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        if current_platform.use_transformers_neuronx():
-            raise NotImplementedError(
-                f"{type(self)} does not support LoRA with Neuron Framework "
-                f"Transformers NeuronX")
-        return self.model_runner.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        if current_platform.use_transformers_neuronx():
-            raise NotImplementedError(
-                f"{type(self)} does not support LoRA with Neuron Framework "
-                f"Transformers NeuronX")
-        return self.model_runner.remove_lora(lora_id)
-
-    def pin_lora(self, lora_id: int) -> bool:
-        if current_platform.use_transformers_neuronx():
-            raise NotImplementedError(
-                f"{type(self)} does not support LoRA with Neuron Framework "
-                f"Transformers NeuronX")
-        return self.model_runner.pin_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        if current_platform.use_transformers_neuronx():
-            raise NotImplementedError(
-                f"{type(self)} does not support LoRA with Neuron Framework "
-                f"Transformers NeuronX")
-        return self.model_runner.list_loras()
diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py
deleted file mode 100644
index 2a0f4e77c99e5c487f0b8457bcc9ce4f625f4945..0000000000000000000000000000000000000000
--- a/vllm/worker/neuronx_distributed_model_runner.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import List, Optional, Set
-
-import torch
-from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import (
-    get_all_supported_aspect_ratios)
-from neuronx_distributed_inference.modules.generation.sampling import (
-    prepare_sampling_params)
-from neuronx_distributed_inference.modules.lora_serving import (
-    LoraCheckpoint, LoraServingConfig)
-
-from vllm.config import VllmConfig
-from vllm.entrypoints.openai.serving_models import LoRAModulePath
-from vllm.logger import init_logger
-from vllm.lora.layers import LoRAMapping
-from vllm.lora.request import LoRARequest
-from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.model_executor.model_loader.neuronx_distributed import (
-    _get_model_architecture, get_neuron_model)
-from vllm.multimodal import MultiModalKwargs
-from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.worker.neuron_model_runner import (ModelInputForNeuron,
-                                             NeuronModelRunner)
-
-logger = init_logger(__name__)
-
-
-class NeuronxDistributedModelRunner(NeuronModelRunner):
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ):
-        super().__init__(vllm_config)
-        self.lora_checkpoint = None
-        self.model = None
-        self.lora_serving_config = None
-
-    @staticmethod
-    def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]):
-        if not lora_modules:
-            return None
-        return {_.get("name"): _.get("path") for _ in lora_modules}
-
-    def _get_nxdi_lora_config(self):
-        override_neuron_config = self.model_config.override_neuron_config
-        lora_modules = override_neuron_config.pop("lora_modules", None)
-        target_modules = override_neuron_config.pop("target_modules", None)
-        lora_ckpt_paths = self._get_lora_paths_strings(lora_modules)
-        if self.lora_config.max_loras < len(lora_ckpt_paths):
-            raise ValueError(
-                "Number of LoRAs (%s) exceeds maximum "
-                "allowed (%s)", len(lora_ckpt_paths),
-                self.lora_config.max_loras)
-
-        return LoraServingConfig(
-            max_loras=self.lora_config.max_loras,
-            max_lora_rank=self.lora_config.max_lora_rank,
-            target_modules=target_modules,
-            lora_ckpt_paths=lora_ckpt_paths,
-        )
-
-    def load_model(self) -> None:
-        # Update LoRA config
-        if self.lora_config is not None:
-            self.lora_serving_config = self._get_nxdi_lora_config()
-            self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config)
-        self.model = get_neuron_model(
-            self.model_config,
-            parallel_config=self.parallel_config,
-            scheduler_config=self.scheduler_config,
-            lora_serving_config=self.lora_serving_config)
-
-    def get_nxd_sampling_params(self, sampling_metadata):
-        if self.model.config.neuron_config.on_device_sampling_config:
-            max_topk = (self.model.config.neuron_config.
-                        on_device_sampling_config.global_topk)
-        else:
-            max_topk = self.model.config.vocab_size
-
-        top_k = [1] * self.scheduler_config.max_num_seqs
-        top_p = [1.0] * self.scheduler_config.max_num_seqs
-        temperature = [1.0] * self.scheduler_config.max_num_seqs
-
-        for index, sequenceGroupToSample in enumerate(
-                sampling_metadata.seq_groups):
-            top_k[index] = (sequenceGroupToSample.sampling_params.top_k
-                            if sequenceGroupToSample.sampling_params.top_k > 0
-                            else max_topk)
-            top_p[index] = sequenceGroupToSample.sampling_params.top_p
-            temperature[index] = (
-                sequenceGroupToSample.sampling_params.temperature)
-
-        sampling_params = prepare_sampling_params(
-            batch_size=self.scheduler_config.max_num_seqs,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature)
-        return sampling_params
-
-    def get_multi_modal_data_neuron(self, input_images):
-        raise NotImplementedError("need to restore multi-modal support")
-
-    @torch.inference_mode()
-    def execute_model(
-        self,
-        model_input: ModelInputForNeuron,
-        kv_caches: Optional[List[torch.Tensor]] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        num_steps: int = 1,
-    ) -> Optional[List[SamplerOutput]]:
-        if num_steps > 1:
-            raise ValueError(
-                "NeuronModelRunner does not support multi-step execution.")
-
-        if _get_model_architecture(
-                self.model.config) != "MllamaForConditionalGeneration":
-            return super().execute_model(model_input, kv_caches,
-                                         intermediate_tensors, num_steps)
-
-        sampling_params = self.get_nxd_sampling_params(
-            model_input.sampling_metadata)
-
-        if model_input.multi_modal_kwargs.get('pixel_values') is not None:
-            hidden_states = self.model(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                seq_ids=model_input.input_block_ids,
-                pixel_values=model_input.multi_modal_kwargs.get(
-                    'pixel_values'),
-                aspect_ratios=model_input.multi_modal_kwargs.get(
-                    'aspect_ratios'),
-                sampling_params=sampling_params,
-                num_chunks=model_input.multi_modal_kwargs.get('num_chunks'),
-                has_image=model_input.multi_modal_kwargs.get(
-                    'has_image').squeeze(1),
-            )
-        else:
-            bs = model_input.input_tokens.shape[0] if (model_input.input_tokens
-                                                       is not None) else 1
-            empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560],
-                                             dtype=torch.bfloat16)
-            empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64)
-            num_chunks = torch.zeros((bs, 1), dtype=torch.int32)
-            has_image = torch.zeros([bs], dtype=torch.int32)
-            hidden_states = self.model(
-                input_ids=model_input.input_tokens,
-                positions=model_input.input_positions,
-                seq_ids=model_input.input_block_ids,
-                pixel_values=empty_pixel_values,
-                aspect_ratios=empty_aspect_ratios,
-                sampling_params=sampling_params,
-                num_chunks=num_chunks,
-                has_image=has_image,
-            )
-
-        output = self.model.sample(
-            hidden_states=hidden_states,
-            sampling_metadata=model_input.sampling_metadata,
-        )
-
-        return [output]
-
-    def process_multi_modal_data_neuron(self, mm_data):
-        # Neuron uses aspect_ratios instead of aspect_ratio_ids
-        all_supported_aspect_ratios = get_all_supported_aspect_ratios(
-            self.model.config.vision_config.max_num_tiles)
-        aspect_ratio_ids = mm_data.get("aspect_ratio_ids")
-        mm_data["aspect_ratios"] = torch.tensor(
-            all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0)
-
-        # Neuron's num_chunks is HF's num_tiles
-        mm_data["num_chunks"] = mm_data.get("num_tiles")
-
-        # Input has an image if it has pixel_values
-        bs = mm_data["num_chunks"].shape[0]
-        pixel_values = mm_data.get("pixel_values")
-        if pixel_values is not None and not torch.all(pixel_values == 0):
-            mm_data["has_image"] = torch.ones(bs)
-
-        else:
-            mm_data["has_image"] = torch.zeros(bs)
-        return mm_data
-
-    def _get_lora_adapter_ids(self, seq_group_metadata_list):
-        # set LoRA adapter IDs for multi-lora serving
-        batch_size = len(seq_group_metadata_list)
-        if self.lora_checkpoint is not None:
-            # "0" indicates NxDI to use the base model for inference
-            adapter_ids = ["0"] * batch_size
-            for idx, seq_group_metadata in enumerate(seq_group_metadata_list):
-                if seq_group_metadata.lora_request is not None:
-                    adapter_ids[
-                        idx] = seq_group_metadata.lora_request.lora_name
-
-            # convert adapter_ids from strings to integers
-            adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices(
-                adapter_ids, batch_size)
-        else:
-            adapter_ids = torch.zeros((batch_size), dtype=torch.int32)
-
-        return adapter_ids
-
-    def prepare_model_input(
-        self,
-        seq_group_metadata_list: List[SequenceGroupMetadata],
-        virtual_engine: int = 0,
-        finished_requests_ids: Optional[List[str]] = None
-    ) -> ModelInputForNeuron:
-        # NOTE: We assume that all sequences in the group are all prompts or
-        # all decodes.
-        is_prompt = seq_group_metadata_list[0].is_prompt
-        # Prepare input tensors.
-        if is_prompt:
-            (input_tokens, input_positions, input_block_ids, seq_lens,
-             multi_modal_kwargs
-             ) = self._prepare_prompt(seq_group_metadata_list)
-        else:
-            (input_tokens, input_positions,
-             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
-            seq_lens = None
-
-        if not self._on_device_sampling_disabled:
-            for seq_group_metadata in seq_group_metadata_list:
-                sampling_params = seq_group_metadata.sampling_params
-                top_k, top_p, temperature = (
-                    self._convert_to_neuron_sampling_params(sampling_params))
-                sampling_params.top_k = top_k
-                sampling_params.top_p = top_p
-                sampling_params.temperature = temperature
-
-        # we need multi_modal_data for later tokens as well
-        multi_modal_kwargs_list: List[MultiModalKwargs] = []
-        for seq_group_metadata in seq_group_metadata_list:
-            mm_data = seq_group_metadata.multi_modal_data
-            if mm_data:
-                multi_modal_kwargs_list.append(mm_data)
-        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
-
-        lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list)
-
-        sampling_metadata = SamplingMetadata.prepare(
-            seq_group_metadata_list,
-            seq_lens,
-            # query_lens is not needed if chunked prefill is not
-            # supported. Since neuron worker doesn't support chunked prefill
-            # just use seq_lens instead.
-            seq_lens,
-            self.device,
-            self.pin_memory,
-            generators=self.get_generators(finished_requests_ids))
-
-        return ModelInputForNeuron(input_tokens=input_tokens,
-                                   input_positions=input_positions,
-                                   input_block_ids=input_block_ids,
-                                   sampling_metadata=sampling_metadata,
-                                   multi_modal_kwargs=multi_modal_kwargs,
-                                   adapter_ids=lora_adapter_ids)
-
-    def remove_all_loras(self):
-        raise NotImplementedError(
-            "Managing LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config")
-
-    def set_active_loras(self, lora_requests: Set[LoRARequest],
-                         lora_mapping: LoRAMapping) -> None:
-        raise NotImplementedError(
-            "Managing LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config")
-
-    def add_lora(self, lora_request: LoRARequest):
-        logger.warning(
-            "Adding LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config. If you supplied "
-            "the parameter, you can ignore this warning. Ignoring"
-            "lora request: ", lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "Managing LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config")
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError(
-            "Managing LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config")
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError(
-            "Managing LoRAs is only supported through the "
-            "lora_modules parameter in override_neuron_config")
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2e20c89c632c5818e9b060cc89c70d47cfdc7b86..670f256c0bf6592cec50a10d9b1d04f2cf7c4e6a 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -78,7 +78,8 @@ class Worker(LocalOrDistributedWorkerBase):
                         "deepseek_mtp",
                         "glm4_moe_mtp",
                         "mimo_mtp",
-                        "ernie_mtp")) \
+                        "ernie_mtp",
+                        "qwen3_next_mtp")) \
                     else {"return_hidden_states": True}
 
         ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner
@@ -128,8 +129,10 @@ class Worker(LocalOrDistributedWorkerBase):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
         self.profiler.stop()
-        print(
-            self.profiler.key_averages().table(sort_by="self_cuda_time_total"))
+        # only print profiler results on rank 0
+        if self.local_rank == 0:
+            print(self.profiler.key_averages().table(
+                sort_by="self_cuda_time_total"))
 
     def sleep(self, level: int = 1) -> None:
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
@@ -226,13 +229,74 @@ class Worker(LocalOrDistributedWorkerBase):
         self.model_runner.save_tensorized_model(
             tensorizer_config=tensorizer_config, )
 
+    @torch.inference_mode()
+    def determine_available_kv_cache_memory(self,
+                                            total_gpu_memory: int) -> float:
+        if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
+            # still need a profile run which compiles the model for
+            # max_num_batched_tokens
+            self.model_runner.profile_run()
+
+            GiB = lambda b: b / GiB_bytes
+            msg = (
+                f"Initial free memory "
+                f"{GiB(self.baseline_snapshot.free_memory):.2f} "
+                f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for "
+                "KV Cache as specified by kv_cache_memory_bytes config and "
+                "skipped memory profiling. This does does not respect the "
+                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
+                "config when you want manual control of KV cache memory "
+                "size. If OOM'ed, check the difference of initial free "
+                "memory between the current run and the previous run "
+                "where kv_cache_memory_bytes is suggested and update it "
+                "correspondingly.")
+            logger.info(msg)
+            return self.cache_config.kv_cache_memory_bytes
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        with memory_profiling(
+                self.baseline_snapshot,
+                weights_memory=self.model_runner.model_memory_usage) as result:
+            self.model_runner.profile_run()
+
+        self.non_torch_memory = result.non_torch_increase
+        self.peak_activation_memory = result.torch_peak_increase
+
+        self._assert_memory_footprint_increased_during_profiling()
+
+        self.requested_memory = total_gpu_memory * \
+            self.cache_config.gpu_memory_utilization
+
+        self.available_kv_cache_memory = (self.requested_memory -
+                                          result.non_kv_cache_memory)
+
+        msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n"
+               "the current vLLM instance can use "
+               "total_gpu_memory "
+               f"({(total_gpu_memory / GiB_bytes):.2f}GiB)"
+               " x gpu_memory_utilization "
+               f"({self.cache_config.gpu_memory_utilization:.2f})"
+               f" = {(self.requested_memory / GiB_bytes):.2f}GiB\n"
+               "model weights take "
+               f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
+               " non_torch_memory takes "
+               f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
+               " PyTorch activation peak memory takes "
+               f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
+               " the rest of the memory reserved for KV Cache is "
+               f"{(self.available_kv_cache_memory / GiB_bytes):.2f}GiB.")
+
+        logger.info(msg)
+        return self.available_kv_cache_memory
+
     @torch.inference_mode()
     def determine_num_available_blocks(self) -> Tuple[int, int]:
         """Profiles the peak memory usage of the model to determine how many
         KV blocks may be allocated without OOMs.
 
         The engine will first conduct a profiling of the existing memory usage.
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
         Tip:
@@ -245,20 +309,8 @@ class Worker(LocalOrDistributedWorkerBase):
         torch.cuda.reset_peak_memory_stats()
 
         free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info()
-
-        # Execute a forward pass with dummy inputs to profile the memory usage
-        # of the model.
-        with memory_profiling(
-                self.baseline_snapshot,
-                weights_memory=self.model_runner.model_memory_usage) as result:
-            self.model_runner.profile_run()
-
-        self._assert_memory_footprint_increased_during_profiling()
-
-        memory_for_current_instance = total_gpu_memory * \
-            self.cache_config.gpu_memory_utilization
-        available_kv_cache_memory = (memory_for_current_instance -
-                                     result.non_kv_cache_memory)
+        available_kv_cache_memory = self.determine_available_kv_cache_memory(
+            total_gpu_memory)
 
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
@@ -273,23 +325,6 @@ class Worker(LocalOrDistributedWorkerBase):
         num_gpu_blocks = max(num_gpu_blocks, 0)
         num_cpu_blocks = max(num_cpu_blocks, 0)
 
-        msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n"
-               "the current vLLM instance can use "
-               "total_gpu_memory "
-               f"({(total_gpu_memory / GiB_bytes):.2f}GiB)"
-               " x gpu_memory_utilization "
-               f"({self.cache_config.gpu_memory_utilization:.2f})"
-               f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n"
-               "model weights take "
-               f"{(result.weights_memory / GiB_bytes):.2f}GiB;"
-               " non_torch_memory takes "
-               f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;"
-               " PyTorch activation peak memory takes "
-               f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;"
-               " the rest of the memory reserved for KV Cache is "
-               f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.")
-
-        logger.info(msg)
         # Final cleanup
         gc.collect()
 
@@ -379,8 +414,58 @@ class Worker(LocalOrDistributedWorkerBase):
         for size in sorted(warmup_sizes, reverse=True):
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size)
+
+        cuda_graph_memory_bytes = 0
         if not self.model_config.enforce_eager:
-            self.model_runner.capture_model(self.gpu_cache)
+            cuda_graph_memory_bytes = self.model_runner.capture_model(
+                self.gpu_cache)
+
+        if (self.cache_config.kv_cache_memory_bytes is None
+                and hasattr(self, "peak_activation_memory")):
+            # Suggests optimal kv cache memory size if we rely on
+            # memory_profiling to guess the kv cache memory size which
+            # provides peak_activation_memory and a few other memory
+            # consumption. `memory_profiling` does not consider
+            # CUDAGraph memory size and may not utilize all gpu memory.
+            # Users may want fine-grained control to specify kv cache
+            # memory size.
+            GiB = lambda b: round(b / GiB_bytes, 2)
+            non_kv_cache_memory = (self.model_runner.model_memory_usage +
+                                   self.peak_activation_memory +
+                                   self.non_torch_memory +
+                                   cuda_graph_memory_bytes)
+
+            # empirically observed that the memory profiling may
+            # slightly underestimate the memory consumption.
+            # So leave a small buffer (=150MiB) to avoid OOM.
+            redundancy_buffer_memory = 150 * (1 << 20)
+            kv_cache_memory_bytes_to_gpu_limit = (
+                self.baseline_snapshot.free_memory - non_kv_cache_memory -
+                redundancy_buffer_memory)
+            kv_cache_memory_bytes_to_requested_limit = (
+                int(self.requested_memory) - non_kv_cache_memory -
+                redundancy_buffer_memory)
+
+            msg = (
+                f"Free memory on device "
+                f"({GiB(self.baseline_snapshot.free_memory)}/"
+                f"{GiB(self.baseline_snapshot.total_memory)} GiB) on startup. "
+                f"Desired GPU memory utilization is "
+                f"({self.cache_config.gpu_memory_utilization}, "
+                f"{GiB(self.requested_memory)} GiB). "
+                f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
+                f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
+                f"for peak activation, {GiB(self.non_torch_memory)} GiB "
+                f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
+                f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
+                f"config with `--kv-cache-memory="
+                f"{kv_cache_memory_bytes_to_requested_limit}` to fit into "
+                f"requested memory, or `--kv-cache-memory="
+                f"{kv_cache_memory_bytes_to_gpu_limit}` to fully "
+                f"utilize gpu memory. Current kv cache memory in use is "
+                f"{int(self.available_kv_cache_memory)} bytes.")
+            logger.info(msg)
+
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
@@ -537,8 +622,10 @@ def init_worker_distributed_environment(
     init_distributed_environment(parallel_config.world_size, rank,
                                  distributed_init_method, local_rank,
                                  current_platform.dist_backend)
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
+    ensure_model_parallel_initialized(
+        parallel_config.tensor_parallel_size,
+        parallel_config.pipeline_parallel_size,
+        parallel_config.decode_context_parallel_size)
 
     ensure_kv_transfer_initialized(vllm_config)
 
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index a1fa7f2cf7a2ecb13a9e1ffaf33e942ee127b127..aa76d21f0fcaa3c80b736771205f94e3743ac387 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -129,6 +129,10 @@ class WorkerBase:
         """Get vocabulary size from model configuration."""
         return self.model_config.get_vocab_size()
 
+    def shutdown(self) -> None:
+        """Clean up resources held by the worker."""
+        return
+
 
 class DelegateWorkerBase(WorkerBase):
     """
@@ -519,6 +523,10 @@ class WorkerWrapperBase:
                 from vllm.utils import init_cached_hf_modules
                 init_cached_hf_modules()
 
+    def shutdown(self) -> None:
+        if self.worker is not None:
+            self.worker.shutdown()
+
     def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
         """
         Adjust the rpc_rank based on the given mapping.