Merge tag 'v0.8.2' into v0.8.2-ori

53076d70 · zhuwenwen · 322a0be6 · 9c5c81b0 · 53076d70 · 53076d70
Commit 53076d70 authored Mar 24, 2025 by zhuwenwen
20 changed files
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -3,8 +3,8 @@ import pytest

 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
-                                           SchedulerOutput)
+from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
+                                       SchedulerOutput)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner


--- a/vllm/attention/__init__.py
+++ b/vllm/attention/__init__.py
@@ -4,12 +4,16 @@ from vllm.attention.backends.abstract import (AttentionBackend,
                                              AttentionMetadata,
                                              AttentionMetadataBuilder,
                                              AttentionState, AttentionType)
-from vllm.attention.backends.utils import get_flash_attn_version
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend

 __all__ = [
-    "Attention", "AttentionBackend", "AttentionMetadata", "AttentionType",
-    "AttentionMetadataBuilder", "Attention", "AttentionState",
-    "get_attn_backend", "get_flash_attn_version"
+    "Attention",
+    "AttentionBackend",
+    "AttentionMetadata",
+    "AttentionType",
+    "AttentionMetadataBuilder",
+    "Attention",
+    "AttentionState",
+    "get_attn_backend",
 ]
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -232,6 +232,7 @@ class AttentionMetadataBuilder(ABC, Generic[T]):

 class AttentionLayer(Protocol):

+    _q_scale: torch.Tensor
    _k_scale: torch.Tensor
    _v_scale: torch.Tensor
    _k_scale_float: float

--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -19,10 +19,10 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 # yapf: enable
 from vllm.attention.backends.utils import (
    PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping,
-    compute_slot_mapping_start_idx, get_flash_attn_version,
-    get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args,
-    is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set,
-    is_block_tables_empty)
+    compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens,
+    get_seq_len_block_table_args, is_all_cross_attn_metadata_set,
+    is_all_encoder_attn_metadata_set, is_block_tables_empty)
+from vllm.fa_utils import get_flash_attn_version
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
@@ -630,9 +630,12 @@ class FlashAttentionImpl(AttentionImpl):
        self.sliding_window = ((sliding_window - 1,
                                0) if sliding_window is not None else (-1, -1))
        self.kv_cache_dtype = kv_cache_dtype
-        if is_quantized_kv_cache(self.kv_cache_dtype):
+        self.vllm_flash_attn_version = get_flash_attn_version(
+            requires_alibi=self.alibi_slopes is not None)
+        if (is_quantized_kv_cache(self.kv_cache_dtype)
+                and self.vllm_flash_attn_version != 3):
            raise NotImplementedError(
-                "FlashAttention with FP8 KV cache not yet supported")
+                "Only FlashAttention3 supports FP8 KV cache")
        if logits_soft_cap is None:
            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
            logits_soft_cap = 0
@@ -647,7 +650,6 @@ class FlashAttentionImpl(AttentionImpl):
                f"Head size {head_size} is not supported by FlashAttention. "
                f"Supported head sizes are: {support_head_sizes}.")
        self.attn_type = attn_type
-        self.vllm_flash_attn_version = get_flash_attn_version()

    def forward(
        self,
@@ -671,13 +673,19 @@ class FlashAttentionImpl(AttentionImpl):
                for profiling run.
            attn_metadata: Metadata for attention.
        NOTE: It in-place updates the output tensor.
+        NOTE: FP8 quantization, flash-attn expect the size of
+              {q,k,v}_descale to be (num_sequences, num_kv_heads).
+              We use torch's .expand() to avoid duplicating values
        """
-        # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
-        assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0, (
-            "key/v_scale is not supported in FlashAttention.")
-
        assert output is not None, "Output tensor must be provided."

+        # NOTE(woosuk): FlashAttention2 does not support FP8 KV cache.
+        if self.vllm_flash_attn_version < 3 or output.dtype != torch.bfloat16:
+            assert (
+                layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0), (
+                    "key/v_scale is only supported in FlashAttention 3 with "
+                    "base dtype bfloat16")
+
        attn_type = self.attn_type
        if (attn_type == AttentionType.ENCODER
                and (not attn_metadata.is_all_encoder_attn_metadata_set)):
@@ -694,6 +702,7 @@ class FlashAttentionImpl(AttentionImpl):
        window_size = self.sliding_window
        alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes
        logits_soft_cap: Optional[float] = self.logits_soft_cap
+        fp8_attention = kv_cache_dtype.startswith("fp8")

        if kv_cache.numel() > 0:
            key_cache = kv_cache[0]
@@ -729,6 +738,19 @@ class FlashAttentionImpl(AttentionImpl):
                    layer._v_scale,
                )

+                if fp8_attention:
+                    kv_cache = kv_cache.view(torch.float8_e4m3fn)
+                    key_cache = key_cache.view(torch.float8_e4m3fn)
+                    value_cache = value_cache.view(torch.float8_e4m3fn)
+
+        if fp8_attention:
+            num_tokens, num_heads, head_size = query.shape
+            query, _ = ops.scaled_fp8_quant(
+                query.reshape(
+                    (num_tokens, num_heads * head_size)).contiguous(),
+                layer._q_scale)
+            query = query.reshape((num_tokens, num_heads, head_size))
+
        (num_prefill_query_tokens, num_prefill_kv_tokens,
        num_decode_query_tokens) = \
            get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type)
@@ -753,6 +775,23 @@ class FlashAttentionImpl(AttentionImpl):
                key = key[:num_prefill_kv_tokens]
                value = value[:num_prefill_kv_tokens]

+                if fp8_attention:
+                    num_kv_tokens, num_kv_heads, head_size = key.shape
+
+                    key, _ = ops.scaled_fp8_quant(
+                        key.reshape((num_kv_tokens,
+                                     num_kv_heads * head_size)).contiguous(),
+                        layer._k_scale)
+                    key = key.reshape((num_kv_tokens, num_kv_heads, head_size))
+
+                    value, _ = ops.scaled_fp8_quant(
+                        value.reshape((num_kv_tokens,
+                                       num_kv_heads * head_size)).contiguous(),
+                        layer._v_scale)
+                    value = value.reshape(
+                        (num_kv_tokens, num_kv_heads, head_size))
+
+                descale_shape = (q_seq_start_loc.shape[0] - 1, key.shape[1])
                flash_attn_varlen_func(
                    q=query,
                    k=key,
@@ -768,13 +807,19 @@ class FlashAttentionImpl(AttentionImpl):
                    softcap=logits_soft_cap,
                    out=prefill_output,
                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
                )
            else:
                # prefix-enabled attention
                assert attn_type == AttentionType.DECODER, (
                    "Only decoder-only models support prefix caching")
                assert prefill_meta.seq_lens is not None
+                assert prefill_meta.query_start_loc is not None
                max_seq_len = max(prefill_meta.seq_lens)
+                descale_shape = (prefill_meta.query_start_loc.shape[0] - 1,
+                                 key.shape[1])
                flash_attn_varlen_func(  # noqa
                    q=query,
                    k=key_cache,
@@ -791,6 +836,9 @@ class FlashAttentionImpl(AttentionImpl):
                    softcap=logits_soft_cap,
                    out=prefill_output,
                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
                )

        if decode_meta := attn_metadata.decode_metadata:
@@ -804,6 +852,9 @@ class FlashAttentionImpl(AttentionImpl):
                assert attn_type == AttentionType.DECODER, (
                    "Only decoder-only models support max_decode_query_len > 1"
                )
+                assert decode_meta.query_start_loc is not None
+                descale_shape = (decode_meta.query_start_loc.shape[0] - 1,
+                                 key.shape[1])
                flash_attn_varlen_func(
                    q=decode_query,
                    k=key_cache,
@@ -820,6 +871,9 @@ class FlashAttentionImpl(AttentionImpl):
                    block_table=decode_meta.block_tables,
                    out=decode_output,
                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
                )
            else:
                # Use flash_attn_with_kvcache for normal decoding.
@@ -828,6 +882,7 @@ class FlashAttentionImpl(AttentionImpl):
                    _,
                    block_tables_arg,
                ) = get_seq_len_block_table_args(decode_meta, False, attn_type)
+                descale_shape = (seq_lens_arg.shape[0], key_cache.shape[-2])
                flash_attn_with_kvcache(
                    q=decode_query.unsqueeze(1),
                    k_cache=key_cache,
@@ -841,6 +896,9 @@ class FlashAttentionImpl(AttentionImpl):
                    softcap=logits_soft_cap,
                    out=decode_output.unsqueeze(1),
                    fa_version=self.vllm_flash_attn_version,
+                    q_descale=layer._q_scale.expand(descale_shape),
+                    k_descale=layer._k_scale.expand(descale_shape),
+                    v_descale=layer._v_scale.expand(descale_shape),
                )
        return output


--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -203,9 +203,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                              AttentionState, MLAAttentionImpl)
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                           compute_slot_mapping_start_idx,
-                                           get_flash_attn_version,
                                           is_block_tables_empty)
 from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
+from vllm.fa_utils import get_flash_attn_version
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearBase, RowParallelLinear,
                                               UnquantizedLinearMethod)

--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
-# SPDX-License-Identifier: Apache-2.0
-
-from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Type
-
-import openvino as ov
-import torch
-
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadata)
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.multimodal import MultiModalPlaceholderMap
-
-
-def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
-                     src_offset: int, dst_offset: int) -> None:
-
-    def create_roi_tensor(
-        tensor: ov.Tensor,
-        block_number: int,
-    ) -> ov.Tensor:
-        roi_begin = ov.runtime.Coordinate([0, 0, 0, 0])
-        roi_end = ov.runtime.Coordinate(tensor.get_shape())
-
-        roi_begin[0] = block_number
-        roi_end[0] = block_number + 1
-
-        if isinstance(tensor, ov.Tensor):
-            return ov.Tensor(tensor, roi_begin, roi_end)
-        else:
-            return ov.RemoteTensor(tensor, roi_begin, roi_end)
-
-    src_roi_tensor = \
-        create_roi_tensor(src_tensor, src_offset)
-    dst_roi_tensor = \
-        create_roi_tensor(dst_tensor, dst_offset)
-    src_roi_tensor.copy_to(dst_roi_tensor)
-
-
-class OpenVINOAttentionBackend(AttentionBackend):
-
-    @staticmethod
-    def get_name() -> str:
-        return "OPENVINO"
-
-    @staticmethod
-    def get_impl_cls():
-        # OpenVINO implements PagedAttention as part of the Optimum
-        # exported model
-        raise NotImplementedError
-
-    @staticmethod
-    def make_metadata(*args, **kwargs) -> "AttentionMetadata":
-        raise NotImplementedError
-
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-
-    @staticmethod
-    def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata":
-        return OpenVINOAttentionMetadata(*args, **kwargs)
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (2, num_blocks, num_kv_heads, block_size, head_size)
-
-    @staticmethod
-    def swap_blocks(
-        src_tensor: ov.Tensor,
-        dst_tensor: ov.Tensor,
-        src_to_dists: List[Tuple[int, int]],
-    ) -> None:
-        for src, dst in src_to_dists:
-            copy_cache_block(src_tensor, dst_tensor, src, dst)
-
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
-        src_to_dists: List[Tuple[int, int]],
-    ) -> None:
-        for src, dst in src_to_dists:
-            for key_cache, value_cache in kv_caches:
-                copy_cache_block(key_cache, key_cache, src, dst)
-                copy_cache_block(value_cache, value_cache, src, dst)
-
-
-@dataclass
-class OpenVINOAttentionMetadata:
-    """Metadata for OpenVINOAttentionBackend.
-
-    Basic terms used below:
-    - batch_size_in_sequences - total number of sequences to execute
-    - prompt_lens – per sequence size number of scheduled tokens
-    - batch_size_in_tokens = sum(prompt_lens)
-    - max_context_len = max(context_lens)
-    - max_num_blocks = div_up(max_context_len / BLOCK_SIZE)
-    - num_blocks – total number of blocks in block_indices
-    """
-
-    # Describes past KV cache size for each sequence within a batch
-    # Shape: [batch_size_in_sequences]
-    # Type: i32
-    past_lens: torch.Tensor
-
-    # Describes start indices of input / speculative tokens from
-    # current sequences within a batch sequence
-    # Shape: [batch_size_in_sequences + 1]
-    # Type: i32
-    subsequence_begins: torch.Tensor
-
-    # Describes block tables for each sequence within a batch -
-    # indices along 0th dimension in key_cache and value_cache inputs
-    # Shape: [num_blocks]
-    # Type: i32
-    block_indices: torch.Tensor
-
-    # Describes block tables for each sequence within a batch -
-    # for i-th element, it is an index in block_indices with the
-    # first block belonging to i-th sequence
-    # Shape: [batch_size_in_sequences + 1]
-    # Type: i32
-    block_indices_begins: torch.Tensor
-
-    # Describes max context length
-    # Shape: scalar
-    # Type: i32
-    max_context_len: torch.Tensor
-
-    # The index maps that relate multi-modal embeddings to the corresponding
-    # placeholders.
-    #
-    # N.B. These aren't really related to attention and don't belong on this
-    # type -- this is just a temporary solution to make them available to
-    # `model_executable`.
-    multi_modal_placeholder_index_maps: Optional[Dict[
-        str, MultiModalPlaceholderMap.IndexMap]]
-
-    # Enable/disable KV scales calculation. This is so that we can disable the
-    # calculation until after prefill and cuda graph capture.
-    enable_kv_scales_calculation: bool
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -8,13 +8,11 @@ from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union
 import numpy as np
 import torch

-from vllm import envs
 from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder,
                            AttentionState)
 from vllm.attention.backends.abstract import AttentionType
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalPlaceholderMap
-from vllm.platforms import current_platform
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad

 logger = init_logger(__name__)
@@ -585,35 +583,3 @@ def get_num_prefill_decode_query_kv_tokens(

    return (num_prefill_query_tokens, num_prefill_kv_tokens,
            num_decode_query_tokens)
-
-
-def get_flash_attn_version():
-    try:
-        from vllm.vllm_flash_attn.flash_attn_interface import (
-            fa_version_unsupported_reason, is_fa_version_supported)
-
-        # if hopper default to FA3, otherwise stick to FA2 for now
-        # TODO(lucas): profile FA3 on ampere to see if it makes sense to
-        #  use FA3 as default for both
-        if current_platform.get_device_capability()[0] == 9:
-            fa_version = 3 if is_fa_version_supported(3) else 2
-        else:
-            fa_version = 2
-
-        if envs.VLLM_FLASH_ATTN_VERSION is not None:
-            assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3]
-            fa_version = envs.VLLM_FLASH_ATTN_VERSION
-            if (current_platform.get_device_capability()[0] == 10
-                    and envs.VLLM_FLASH_ATTN_VERSION == 3):
-                logger.warning("Cannot use FA version 3 on Blackwell platform",
-                               "defaulting to FA version 2.")
-                fa_version = 2
-
-        if not is_fa_version_supported(fa_version):
-            logger.error("Cannot use FA version %d is not supported due to %s",
-                         fa_version, fa_version_unsupported_reason(fa_version))
-
-        assert is_fa_version_supported(fa_version)
-        return fa_version
-    except (ImportError, AssertionError):
-        return None
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -84,6 +84,9 @@ class Attention(nn.Module):
        self.calculate_kv_scales = calculate_kv_scales
        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
+        # FlashAttn doesn't support quantizing the kv-cache only
+        # but requires q to be quantized as well.
+        self._q_scale = torch.tensor(1.0, dtype=torch.float32)

        # We also keep the float32 versions of k/v_scale for attention
        # backends that don't support tensors (Flashinfer)
@@ -153,6 +156,7 @@ class Attention(nn.Module):
            ).parallel_config.pipeline_parallel_size)
        ]

+        self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32)
        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)

@@ -178,7 +182,7 @@ class Attention(nn.Module):
        if self.calculate_kv_scales:
            attn_metadata = get_forward_context().attn_metadata
            if attn_metadata.enable_kv_scales_calculation:
-                self.calc_kv_scales(key, value)
+                self.calc_kv_scales(query, key, value)
        if self.use_output:
            output_shape = (output_shape
                            if output_shape is not None else query.shape)
@@ -225,7 +229,8 @@ class Attention(nn.Module):
                return torch.ops.vllm.unified_attention(
                    query, key, value, self.layer_name)

-    def calc_kv_scales(self, key, value):
+    def calc_kv_scales(self, query, key, value):
+        self._q_scale.copy_(torch.abs(query).max() / self.q_range)
        self._k_scale.copy_(torch.abs(key).max() / self.k_range)
        self._v_scale.copy_(torch.abs(value).max() / self.v_range)
        self._k_scale_float = self._k_scale.item()
@@ -276,8 +281,7 @@ class MultiHeadAttention(nn.Module):
            backend = _Backend.XFORMERS

        self.attn_backend = backend if backend in {
-            _Backend.TORCH_SDPA,
-            _Backend.XFORMERS,
+            _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
        } else _Backend.TORCH_SDPA

    def forward(
@@ -315,6 +319,13 @@ class MultiHeadAttention(nn.Module):
                                                 value,
                                                 scale=self.scale)
            out = out.transpose(1, 2)
+        elif self.attn_backend == _Backend.PALLAS_VLLM_V1:
+            query, key, value = (x.transpose(1, 2)
+                                 for x in (query, key, value))
+            from torch_xla.experimental.custom_kernel import flash_attention
+            out = flash_attention(query, key, value, sm_scale=self.scale)
+            out = out.transpose(1, 2)
+
        return out.reshape(bsz, q_len, -1)



--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -357,6 +357,11 @@ class VllmBackend:
            # graph.

            factors = []
+            # 0. factors come from the env, for example, The values of
+            # VLLM_PP_LAYER_PARTITION will affects the computation graph.
+            env_hash = envs.compute_hash()
+            factors.append(env_hash)
+
            # 1. factors come from the vllm_config (it mainly summarizes how the
            #    model is created)
            config_hash = vllm_config.compute_hash()
@@ -399,6 +404,7 @@ class VllmBackend:
        rank = vllm_config.parallel_config.rank
        dp_rank = vllm_config.parallel_config.data_parallel_rank
        local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
+        os.makedirs(local_cache_dir, exist_ok=True)
        self.compilation_config.local_cache_dir = local_cache_dir

        disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -246,6 +246,7 @@ class ModelConfig:
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 20,
        disable_sliding_window: bool = False,
+        disable_cascade_attn: bool = False,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, list[str]]] = None,
        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
@@ -322,6 +323,7 @@ class ModelConfig:
        self.max_seq_len_to_capture = max_seq_len_to_capture
        self.max_logprobs = max_logprobs
        self.disable_sliding_window = disable_sliding_window
+        self.disable_cascade_attn = disable_cascade_attn
        self.skip_tokenizer_init = skip_tokenizer_init
        self.enable_sleep_mode = enable_sleep_mode

@@ -670,14 +672,6 @@ class ModelConfig:
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
-        if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
-                and not self.enforce_eager):
-            logger.warning(
-                "CUDA graph is not supported for %s yet, fallback to the eager "
-                "mode.", self.hf_config.model_type)
-            self.enforce_eager = True
-
    def _verify_bnb_config(self) -> None:
        """
        The current version of bitsandbytes (0.44.0) with 8-bit models does not
@@ -1029,6 +1023,13 @@ class ModelConfig:
                    "max_new_tokens")
        else:
            diff_sampling_param = {}
+
+        if diff_sampling_param:
+            logger.warning_once(
+                "Default sampling parameters have been overridden by the "
+                "model's Hugging Face generation config recommended from the "
+                "model creator. If this is not intended, please relaunch "
+                "vLLM instance with `--generation-config vllm`.")
        return diff_sampling_param

    @property
@@ -1300,6 +1301,12 @@ class LoadConfig:
            "tensorizer" will use CoreWeave's tensorizer library for
                fast weight loading.
            "bitsandbytes" will load nf4 type weights.
+            "sharded_state" will load weights from pre-sharded checkpoint files,
+                supporting efficient loading of tensor-parallel models.
+            "gguf" will load weights from GGUF format files.
+            "mistral" will load weights from consolidated safetensors files used
+                by Mistral models.
+            "runai_streamer" will load weights from RunAI streamer format files.
        model_loader_extra_config: The extra config for the model loader.
        ignore_patterns: The list of patterns to ignore when loading the model.
            Default to "original/**/*" to avoid repeated loading of llama's
@@ -1473,7 +1480,7 @@ class ParallelConfig:
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

-        ray_only_devices = ["tpu"]
+        ray_only_devices: list[str] = []
        from vllm.platforms import current_platform
        if (current_platform.device_type in ray_only_devices
                and self.world_size > 1):
@@ -1801,7 +1808,7 @@ class DeviceConfig:
            self.device_type = device

        # Some device types require processing inputs on CPU
-        if self.device_type in ["neuron", "openvino"]:
+        if self.device_type in ["neuron"]:
            self.device = torch.device("cpu")
        elif self.device_type in ["tpu"]:
            self.device = None
@@ -1810,12 +1817,139 @@ class DeviceConfig:
            self.device = torch.device(self.device_type)


+@dataclass
 class SpeculativeConfig:
-    """Configuration for speculative decoding.
+    """
+    Configuration for speculative decoding.
+    Configurable parameters include:
+    - General Speculative Decoding Control:
+        - num_speculative_tokens (int): The number of speculative
+            tokens, if provided. It will default to the number in the draft
+            model config if present, otherwise, it is required.
+        - model (Optional[str]): The name of the draft model, eagle head,
+            or additional weights, if provided.
+        - method (Optional[str]): The name of the speculative method to use.
+            If users provide and set the `model` param, the speculative method
+            type will be detected automatically if possible, if `model` param
+            is not provided, the method name must be provided.
+            - Possible values:
+                - ngram
+                    Related additional configuration:
+                    - prompt_lookup_max (Optional[int]):
+                        Maximum size of ngram token window when using Ngram
+                        proposer, required when method is set to ngram.
+                    - prompt_lookup_min (Optional[int]):
+                        Minimum size of ngram token window when using Ngram
+                        proposer, if provided. Defaults to 1.
+                - eagle
+                - medusa
+                - mlp_speculator
+                - draft_model
+        - acceptance_method (str): The method to use for accepting draft
+            tokens. This can take two possible values: 'rejection_sampler' and
+            'typical_acceptance_sampler' for RejectionSampler and
+            TypicalAcceptanceSampler respectively. If not specified, it
+            defaults to 'rejection_sampler'.
+            - Possible values:
+                - rejection_sampler
+                - typical_acceptance_sampler
+                    Related additional configuration:
+                    - posterior_threshold (Optional[float]):
+                        A threshold value that sets a lower bound on the
+                        posterior probability of a token in the target model
+                        for it to be accepted. This threshold is used only
+                        when we use the TypicalAcceptanceSampler for token
+                        acceptance.
+                    - posterior_alpha (Optional[float]):
+                        Scaling factor for entropy-based threshold, applied
+                        when using TypicalAcceptanceSampler.
+        - draft_tensor_parallel_size (Optional[int]): The degree of the tensor
+            parallelism for the draft model. Can only be 1 or the same as the
+            target model's tensor parallel size.
+        - disable_logprobs (bool): If set to True, token log probabilities are
+            not returned during speculative decoding. If set to False, token
+            log probabilities are returned according to the log probability
+            settings in SamplingParams. If not specified, it defaults to True.
+
+    - Draft Model Configuration:
+        - quantization (Optional[str]): Quantization method that was used to
+            quantize the draft model weights. If None, we assume the
+            model weights are not quantized. Note that it only takes effect
+            when using the draft model-based speculative method.
+        - max_model_len (Optional[int]): The maximum model length of the
+            draft model. Used when testing the ability to skip
+            speculation for some sequences.
+        - revision: The specific model version to use for the draft model. It
+            can be a branch name, a tag name, or a commit id. If unspecified,
+            will use the default version.
+        - code_revision: The specific revision to use for the draft model code
+            on Hugging Face Hub. It can be a branch name, a tag name, or a
+            commit id. If unspecified, will use the default version.

-    The configuration is currently specialized to draft-model speculative
-    decoding with top-1 proposals.
+    - Advanced Control:
+        - disable_mqa_scorer (bool): Disable the MQA scorer and fall back to
+            batch expansion for scoring proposals. If not specified, it
+            defaults to False.
+        - disable_by_batch_size (Optional[int]): Disable speculative decoding
+            for new incoming requests when the number of enqueued requests is
+            larger than this value, if provided.
+
+    Although the parameters above are structured hierarchically, there is no
+    need to nest them during configuration.
+
+    Non-configurable internal parameters include:
+    - Model Configuration:
+        - target_model_config (ModelConfig): The configuration of the target
+            model.
+        - draft_model_config (ModelConfig): The configuration of the draft
+            model initialized internal.
+    - Parallelism Configuration:
+        - target_parallel_config (ParallelConfig): The parallel configuration
+            for the target model.
+        - draft_parallel_config (ParallelConfig): The parallel configuration
+            for the draft model initialized internal.
+    - Execution Control:
+        - enable_chunked_prefill (bool): Whether vLLM is configured to use
+            chunked prefill or not. Used for raising an error since it's not
+            yet compatible with speculative decode.
+        - disable_log_stats (bool): Whether to disable the periodic printing of
+            stage times in speculative decoding.
    """
+    # speculative configs from cli args
+    num_speculative_tokens: int = field(default=None,
+                                        init=True)  # type: ignore
+    method: Optional[str] = None
+    acceptance_method: str = "rejection_sampler"
+    draft_tensor_parallel_size: Optional[int] = None
+    disable_logprobs: bool = True
+
+    model: Optional[str] = None
+    quantization: Optional[str] = None
+    max_model_len: Optional[int] = None
+    revision: Optional[str] = None
+    code_revision: Optional[str] = None
+
+    disable_mqa_scorer: bool = False
+    disable_by_batch_size: Optional[int] = None
+    prompt_lookup_max: Optional[int] = None
+    prompt_lookup_min: Optional[int] = None
+    posterior_threshold: Optional[float] = None
+    posterior_alpha: Optional[float] = None
+
+    # required configuration params passed from engine
+    target_model_config: ModelConfig = field(default=None,
+                                             init=True)  # type: ignore
+    target_parallel_config: ParallelConfig = field(default=None,
+                                                   init=True)  # type: ignore
+    enable_chunked_prefill: bool = field(default=None,
+                                         init=True)  # type: ignore
+    disable_log_stats: bool = field(default=None, init=True)  # type: ignore
+
+    # params generated in the post-init stage
+    draft_model_config: ModelConfig = field(default=None,
+                                            init=True)  # type: ignore
+    draft_parallel_config: ParallelConfig = field(default=None,
+                                                  init=True)  # type: ignore

    def compute_hash(self) -> str:
        """
@@ -1835,6 +1969,11 @@ class SpeculativeConfig:
        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
        return hash_str

+    @classmethod
+    def from_dict(cls, dict_value: dict) -> "SpeculativeConfig":
+        """Parse the CLI value for the speculative config."""
+        return cls(**dict_value)
+
    @staticmethod
    def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
        if hf_config.model_type == "deepseek_v3":
@@ -1847,230 +1986,172 @@ class SpeculativeConfig:
            })
        return hf_config

-    @staticmethod
-    def maybe_create_spec_config(
-        target_model_config: ModelConfig,
-        target_parallel_config: ParallelConfig,
-        target_dtype: str,
-        speculative_model: Optional[str],
-        speculative_model_quantization: Optional[str],
-        speculative_draft_tensor_parallel_size: Optional[int],
-        num_speculative_tokens: Optional[int],
-        speculative_disable_mqa_scorer: Optional[bool],
-        speculative_max_model_len: Optional[int],
-        enable_chunked_prefill: bool,
-        disable_log_stats: bool,
-        speculative_disable_by_batch_size: Optional[int],
-        ngram_prompt_lookup_max: Optional[int],
-        ngram_prompt_lookup_min: Optional[int],
-        draft_token_acceptance_method: str,
-        typical_acceptance_sampler_posterior_threshold: Optional[float],
-        typical_acceptance_sampler_posterior_alpha: Optional[float],
-        disable_logprobs: Optional[bool],
-    ) -> Optional["SpeculativeConfig"]:
-        """Create a SpeculativeConfig if possible, else return None.
-
-        This function attempts to create a SpeculativeConfig object based on the
-        provided parameters. If the necessary conditions are met, it returns an
-        instance of SpeculativeConfig. Otherwise, it returns None.
-
-        Args:
-            target_model_config (ModelConfig): The configuration of the target
-                model.
-            target_parallel_config (ParallelConfig): The parallel configuration
-                for the target model.
-            target_dtype (str): The data type used for the target model.
-            speculative_model (Optional[str]): The name of the speculative
-                model, if provided.
-            speculative_model_quantization (Optional[str]): Quantization method
-                that was used to quantize the speculative model weights. If
-                None, we assume the model weights are not quantized.
-            speculative_draft_tensor_parallel_size (Optional[int]): The degree
-                of the tensor parallelism for the draft model.
-            num_speculative_tokens (Optional[int]): The number of speculative
-                tokens, if provided. Will default to the number in the draft
-                model config if present, otherwise is required.
-            speculative_disable_mqa_scorer (Optional[bool]): Disable the MQA
-                scorer for the speculative model and fall back to batch
-                expansion for scoring.
-            speculative_max_model_len (Optional[int]): The maximum model len of
-                the speculative model. Used when testing the ability to skip
-                speculation for some sequences.
-            enable_chunked_prefill (bool): Whether vLLM is configured to use
-                chunked prefill or not. Used for raising an error since its not
-                yet compatible with spec decode.
-            speculative_disable_by_batch_size (Optional[int]): Disable
-                speculative decoding for new incoming requests when the number
-                of enqueue requests  is larger than this value, if provided.
-            ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
-                window, if provided.
-            ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
-                window, if provided.
-            draft_token_acceptance_method (str): The method to use for
-                accepting draft tokens. This can take two possible
-                values 'rejection_sampler' and 'typical_acceptance_sampler'
-                for RejectionSampler and TypicalAcceptanceSampler
-                respectively.
-            typical_acceptance_sampler_posterior_threshold (Optional[float]):
-                A threshold value that sets a lower bound on the posterior
-                probability of a token in the target model for it to be
-                accepted. This threshold is used only when we use the
-                TypicalAcceptanceSampler for token acceptance.
-            typical_acceptance_sampler_posterior_alpha (Optional[float]):
-                A scaling factor for the entropy-based threshold in the
-                TypicalAcceptanceSampler.
-            disable_logprobs (Optional[bool]): If set to True, token log
-                probabilities are not returned during speculative decoding.
-                If set to False, token log probabilities are returned
-                according to the log probability settings in SamplingParams.
-                If not specified, it defaults to True.
+    def __post_init__(self):

-        Returns:
-            Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
-                the necessary conditions are met, else None.
-        """
-        if speculative_model is None:
-            if num_speculative_tokens is not None:
-                if target_model_config.hf_text_config.model_type \
+        # Note: After next release, the method parameter will be used to
+        # specify the speculative method, which helps to extend the
+        # configuration of non-model-based proposers, and the model parameter
+        # will be used when the draft model or head is needed.
+        # If users do not specify the method, the speculative method will
+        # be detected automatically if possible. If the speculative method can
+        # not be detected, it will be considered as the draft-model-based
+        # method by default.
+
+        if self.model is None and self.num_speculative_tokens is not None:
+            # TODO(Shangming): Refactor mtp configuration logic when supporting
+            # mtp acceleration for more models besides deepseek_v3
+            if self.target_model_config.hf_text_config.model_type \
                        == "deepseek_v3":
-                    # use the draft model from the same model:
-                    speculative_model = target_model_config.model
-                else:
-                    raise ValueError(
-                        "num_speculative_tokens was provided without "
-                        "speculative_model.")
+                # use the draft model from the same model:
+                self.model = self.target_model_config.model
+            elif self.method in ("ngram", "[ngram]"):
+                self.model = "ngram"
            else:
-                return None
-
-        if (speculative_disable_by_batch_size is not None
-                and speculative_disable_by_batch_size < 2):
-            raise ValueError("Expect the batch size threshold of disabling "
-                             "speculative decoding is > 1, but got "
-                             f"{speculative_disable_by_batch_size=}")
-        if (enable_chunked_prefill and speculative_model == "eagle"):
-            raise ValueError("Chunked prefill and EAGLE are not compatible.")
-        # TODO: The user should be able to specify revision/max model len
-        # for the draft model. It is not currently supported.
-        draft_revision = None
-        draft_code_revision = None
-        draft_quantization = speculative_model_quantization
-
-        if speculative_model == "[ngram]":
-            if ngram_prompt_lookup_min is None:
-                ngram_prompt_lookup_min = 1
-            if ngram_prompt_lookup_max is None or ngram_prompt_lookup_max < 1:
-                raise ValueError(f"{ngram_prompt_lookup_max=} must be > 0")
-            if ngram_prompt_lookup_min < 1:
-                raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0")
-            if ngram_prompt_lookup_min > ngram_prompt_lookup_max:
-                raise ValueError(f"{ngram_prompt_lookup_min=} cannot be "
-                                 f"larger than {ngram_prompt_lookup_max=}")
+                raise ValueError("num_speculative_tokens was provided without "
+                                 "speculative model.")
+
+        # Automatically configure the ngram method during configuration
+        # refactoring to ensure a smooth transition.
+        if self.method is None and (self.model is not None
+                                    and self.model in ("ngram", "[ngram]")):
+            self.method = "ngram"
+
+        if self.method in ("ngram", "[ngram]"):
+            # Unified to "ngram" internally
+            self.method = "ngram"
+            # Set default values if not provided
+            if (self.prompt_lookup_min is None
+                    and self.prompt_lookup_max is None):
+                # TODO(woosuk): Tune these values. They are arbitrarily chosen.
+                self.prompt_lookup_min = 5
+                self.prompt_lookup_max = 5
+            elif self.prompt_lookup_min is None:
+                assert self.prompt_lookup_max is not None
+                self.prompt_lookup_min = self.prompt_lookup_max
+            elif self.prompt_lookup_max is None:
+                assert self.prompt_lookup_min is not None
+                self.prompt_lookup_max = self.prompt_lookup_min
+
+            # Validate values
+            if self.prompt_lookup_min < 1:
+                raise ValueError(
+                    f"prompt_lookup_min={self.prompt_lookup_min} must be > 0")
+            if self.prompt_lookup_max < 1:
+                raise ValueError(
+                    f"prompt_lookup_max={self.prompt_lookup_max} must be > 0")
+            if self.prompt_lookup_min > self.prompt_lookup_max:
+                raise ValueError(
+                    f"prompt_lookup_min={self.prompt_lookup_min} must "
+                    f"be <= prompt_lookup_max={self.prompt_lookup_max}")

            # TODO: current we still need extract vocab_size from target model
            # config, in future, we may try refactor it out, and set
            # draft related config as None here.
-            draft_model_config = target_model_config
-            draft_parallel_config = target_parallel_config
+            self.draft_model_config = self.target_model_config
+            self.draft_parallel_config = self.target_parallel_config
        else:
-            ngram_prompt_lookup_max = 0
-            ngram_prompt_lookup_min = 0
-            draft_model_config = ModelConfig(
-                model=speculative_model,
-                task="draft",
-                tokenizer=target_model_config.tokenizer,
-                tokenizer_mode=target_model_config.tokenizer_mode,
-                trust_remote_code=target_model_config.trust_remote_code,
-                allowed_local_media_path=target_model_config.
-                allowed_local_media_path,
-                dtype=target_model_config.dtype,
-                seed=target_model_config.seed,
-                revision=draft_revision,
-                code_revision=draft_code_revision,
-                tokenizer_revision=target_model_config.tokenizer_revision,
-                max_model_len=None,
-                spec_target_max_model_len=target_model_config.max_model_len,
-                quantization=draft_quantization,
-                enforce_eager=target_model_config.enforce_eager,
-                max_seq_len_to_capture=target_model_config.
-                max_seq_len_to_capture,
-                max_logprobs=target_model_config.max_logprobs,
-                hf_overrides=SpeculativeConfig.hf_config_override,
-            )
-
-            draft_hf_config = draft_model_config.hf_config
+            self.prompt_lookup_max = 0
+            self.prompt_lookup_min = 0
+
+            if self.model is not None:
+                self.draft_model_config = ModelConfig(
+                    model=self.model,
+                    task="draft",
+                    tokenizer=self.target_model_config.tokenizer,
+                    tokenizer_mode=self.target_model_config.tokenizer_mode,
+                    trust_remote_code=self.target_model_config.
+                    trust_remote_code,
+                    allowed_local_media_path=self.target_model_config.
+                    allowed_local_media_path,
+                    dtype=self.target_model_config.dtype,
+                    seed=self.target_model_config.seed,
+                    revision=self.revision,
+                    code_revision=self.code_revision,
+                    tokenizer_revision=self.target_model_config.
+                    tokenizer_revision,
+                    max_model_len=None,
+                    spec_target_max_model_len=self.target_model_config.
+                    max_model_len,
+                    quantization=self.quantization,
+                    enforce_eager=self.target_model_config.enforce_eager,
+                    max_seq_len_to_capture=self.target_model_config.
+                    max_seq_len_to_capture,
+                    max_logprobs=self.target_model_config.max_logprobs,
+                    hf_overrides=SpeculativeConfig.hf_config_override,
+                )

-            # Detect EAGLE prefix to replace hf_config for EAGLE draft_model
-            if "eagle-" in draft_model_config.model.lower():
-                from vllm.transformers_utils.configs.eagle import EAGLEConfig
-                if isinstance(draft_model_config.hf_config, EAGLEConfig):
-                    pass
+                # Automatically detect the method
+                if "eagle-" in self.draft_model_config.model.lower():
+                    self.method = "eagle"
+                elif self.draft_model_config.hf_config.model_type == "medusa":
+                    self.method = "medusa"
+                elif (self.draft_model_config.hf_config.model_type ==
+                      "mlp_speculator"):
+                    self.method = "mlp_speculator"
                else:
-                    eagle_config = EAGLEConfig(draft_model_config.hf_config)
-                    draft_model_config.hf_config = eagle_config
-
-            if (num_speculative_tokens is not None
-                    and hasattr(draft_hf_config, "num_lookahead_tokens")):
-                draft_hf_config.num_lookahead_tokens = num_speculative_tokens
-            n_predict = getattr(draft_hf_config, "n_predict", None)
-            if n_predict is not None:
-                if num_speculative_tokens is None:
-                    # Default to max value defined in draft model config.
-                    num_speculative_tokens = n_predict
-                elif num_speculative_tokens > n_predict and \
-                        num_speculative_tokens % n_predict != 0:
-                    # Ensure divisibility for MTP module reuse.
-                    raise ValueError(
-                        f"{num_speculative_tokens=} must be divisible by "
-                        f"{n_predict=}")
-
-            speculative_draft_tensor_parallel_size = \
-                SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
-                    target_parallel_config,
-                    speculative_draft_tensor_parallel_size,
-                    draft_hf_config
-            )
+                    self.method = "draft_model"
+
+                # Replace hf_config for EAGLE draft_model
+                if self.method == "eagle":
+                    if self.enable_chunked_prefill:
+                        raise ValueError(
+                            "Chunked prefill and EAGLE are not compatible.")
+
+                    from vllm.transformers_utils.configs.eagle import (
+                        EAGLEConfig)
+                    if isinstance(self.draft_model_config.hf_config,
+                                  EAGLEConfig):
+                        pass
+                    else:
+                        eagle_config = EAGLEConfig(
+                            self.draft_model_config.hf_config)
+                        self.draft_model_config.hf_config = eagle_config
+
+                if (self.num_speculative_tokens is not None
+                        and hasattr(self.draft_model_config.hf_config,
+                                    "num_lookahead_tokens")):
+                    self.draft_model_config.hf_config.num_lookahead_tokens = \
+                    self.num_speculative_tokens
+
+                n_predict = getattr(self.draft_model_config.hf_config,
+                                    "n_predict", None)
+                if n_predict is not None:
+                    if self.num_speculative_tokens is None:
+                        # Default to max value defined in draft model config.
+                        self.num_speculative_tokens = n_predict
+                    elif self.num_speculative_tokens > n_predict and \
+                            self.num_speculative_tokens % n_predict != 0:
+                        # Ensure divisibility for MTP module reuse.
+                        raise ValueError(
+                            f"num_speculative_tokens:{self.num_speculative_tokens}"
+                            f" must be divisible by {n_predict=}")
+
+                self.draft_tensor_parallel_size = \
+                    SpeculativeConfig._verify_and_get_draft_tp(
+                        self.target_parallel_config,
+                        self.draft_tensor_parallel_size,
+                        self.draft_model_config.hf_config
+                )

-            draft_model_config.max_model_len = (
-                SpeculativeConfig._maybe_override_draft_max_model_len(
-                    speculative_max_model_len,
-                    draft_model_config.max_model_len,
-                    target_model_config.max_model_len,
-                ))
+                self.draft_model_config.max_model_len = (
+                    SpeculativeConfig._maybe_override_draft_max_model_len(
+                        self.max_model_len,
+                        self.draft_model_config.max_model_len,
+                        self.target_model_config.max_model_len,
+                    ))

-            draft_parallel_config = (
-                SpeculativeConfig.create_draft_parallel_config(
-                    target_parallel_config,
-                    speculative_draft_tensor_parallel_size, draft_hf_config))
+                self.draft_parallel_config = (
+                    SpeculativeConfig.create_draft_parallel_config(
+                        self.target_parallel_config,
+                        self.draft_tensor_parallel_size))

-        if num_speculative_tokens is None:
-            raise ValueError(
-                "num_speculative_tokens must be provided with "
-                "speculative_model unless the draft model config contains an "
-                "n_predict parameter.")
+        if self.acceptance_method == "typical_acceptance_sampler":
+            if self.posterior_threshold is None:
+                self.posterior_threshold = 0.09
+            if self.posterior_alpha is None:
+                self.posterior_alpha = 0.3

-        if typical_acceptance_sampler_posterior_threshold is None:
-            typical_acceptance_sampler_posterior_threshold = 0.09
-        if typical_acceptance_sampler_posterior_alpha is None:
-            typical_acceptance_sampler_posterior_alpha = 0.3
-        if disable_logprobs is None:
-            disable_logprobs = True
-
-        return SpeculativeConfig(
-            draft_model_config,
-            draft_parallel_config,
-            num_speculative_tokens,
-            speculative_disable_mqa_scorer,
-            speculative_disable_by_batch_size,
-            ngram_prompt_lookup_max,
-            ngram_prompt_lookup_min,
-            draft_token_acceptance_method=draft_token_acceptance_method,
-            typical_acceptance_sampler_posterior_threshold=\
-                typical_acceptance_sampler_posterior_threshold,
-            typical_acceptance_sampler_posterior_alpha=\
-                typical_acceptance_sampler_posterior_alpha,
-            disable_logprobs=disable_logprobs,
-            disable_log_stats=disable_log_stats,
-        )
+        self._verify_args()

    @staticmethod
    def _maybe_override_draft_max_model_len(
@@ -2108,7 +2189,7 @@ class SpeculativeConfig:
        )

    @staticmethod
-    def _verify_and_get_draft_model_tensor_parallel_size(
+    def _verify_and_get_draft_tp(
            target_parallel_config: ParallelConfig,
            speculative_draft_tensor_parallel_size: Optional[int],
            draft_hf_config: PretrainedConfig) -> int:
@@ -2140,7 +2221,6 @@ class SpeculativeConfig:
    def create_draft_parallel_config(
        target_parallel_config: ParallelConfig,
        speculative_draft_tensor_parallel_size: int,
-        draft_hf_config: PretrainedConfig,
    ) -> ParallelConfig:
        """Create a parallel config for use by the draft worker.

@@ -2164,74 +2244,13 @@ class SpeculativeConfig:

        return draft_parallel_config

-    def __init__(
-        self,
-        draft_model_config: ModelConfig,
-        draft_parallel_config: ParallelConfig,
-        num_speculative_tokens: int,
-        speculative_disable_mqa_scorer: Optional[bool],
-        speculative_disable_by_batch_size: Optional[int],
-        ngram_prompt_lookup_max: Optional[int],
-        ngram_prompt_lookup_min: Optional[int],
-        draft_token_acceptance_method: str,
-        typical_acceptance_sampler_posterior_threshold: float,
-        typical_acceptance_sampler_posterior_alpha: float,
-        disable_logprobs: bool,
-        disable_log_stats: bool,
-    ):
-        """Create a SpeculativeConfig object.
-
-        Args:
-            draft_model_config: ModelConfig for the draft model.
-            draft_parallel_config: ParallelConfig for the draft model.
-            num_speculative_tokens: The number of tokens to sample from the
-                draft model before scoring with the target model.
-            speculative_disable_by_batch_size: Disable speculative
-                decoding for new incoming requests when the number of
-                enqueue requests is larger than this value.
-            ngram_prompt_lookup_max: Max size of ngram token window.
-            ngram_prompt_lookup_min: Min size of ngram token window.
-            draft_token_acceptance_method (str): The method to use for
-                accepting draft tokens. This can take two possible
-                values 'rejection_sampler' and 'typical_acceptance_sampler'
-                for RejectionSampler and TypicalAcceptanceSampler
-                respectively.
-            typical_acceptance_sampler_posterior_threshold (Optional[float]):
-                A threshold value that sets a lower bound on the posterior
-                probability of a token in the target model for it to be
-                accepted. This threshold is used only when we use the
-                TypicalAcceptanceSampler for token acceptance.
-            typical_acceptance_sampler_posterior_alpha (Optional[float]):
-                A scaling factor for the entropy-based threshold in the
-                TypicalAcceptanceSampler.
-            disable_logprobs: If set to True, token log probabilities will not
-                be returned even if requested by sampling parameters. This
-                reduces latency by skipping logprob calculation in proposal
-                sampling, target sampling, and after accepted tokens are
-                determined. If set to False, log probabilities will be
-                returned.
-            disable_log_stats: Whether to disable periodic printing of stage
-                times in speculative decoding.
-        """
-        self.draft_model_config = draft_model_config
-        self.draft_parallel_config = draft_parallel_config
-        self.num_speculative_tokens = num_speculative_tokens
-        self.speculative_disable_mqa_scorer = speculative_disable_mqa_scorer
-        self.speculative_disable_by_batch_size = \
-            speculative_disable_by_batch_size
-        self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
-        self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0
-        self.draft_token_acceptance_method = draft_token_acceptance_method
-        self.typical_acceptance_sampler_posterior_threshold = \
-            typical_acceptance_sampler_posterior_threshold
-        self.typical_acceptance_sampler_posterior_alpha = \
-            typical_acceptance_sampler_posterior_alpha
-        self.disable_logprobs = disable_logprobs
-        self.disable_log_stats = disable_log_stats
-
-        self._verify_args()
-
    def _verify_args(self) -> None:
+        if self.num_speculative_tokens is None:
+            raise ValueError(
+                "num_speculative_tokens must be provided with "
+                "speculative model unless the draft model config contains an "
+                "n_predict parameter.")
+
        if self.num_speculative_tokens <= 0:
            raise ValueError("Expected num_speculative_tokens to be greater "
                             f"than zero ({self.num_speculative_tokens}).")
@@ -2241,29 +2260,34 @@ class SpeculativeConfig:
                self.draft_parallel_config)
            # Validate and set draft token acceptance related settings.

-        if (self.draft_token_acceptance_method is None):
-            raise ValueError("draft_token_acceptance_method is not set. "
+        if self.acceptance_method is None:
+            raise ValueError("acceptance_method is not set. "
                             "Expected values are rejection_sampler or "
                             "typical_acceptance_sampler.")

-        if (self.draft_token_acceptance_method != 'rejection_sampler'
-                and self.draft_token_acceptance_method
-                != 'typical_acceptance_sampler'):
+        if (self.acceptance_method != 'rejection_sampler'
+                and self.acceptance_method != 'typical_acceptance_sampler'):
            raise ValueError(
-                "Expected draft_token_acceptance_method to be either "
+                "Expected acceptance_method to be either "
                "rejection_sampler or typical_acceptance_sampler. Instead it "
-                f"is {self.draft_token_acceptance_method}")
+                f"is {self.acceptance_method}")

-        if (self.typical_acceptance_sampler_posterior_threshold < 0
-                or self.typical_acceptance_sampler_posterior_alpha < 0):
+        if self.acceptance_method == "typical_acceptance_sampler" and (
+            (self.posterior_threshold is not None
+             and self.posterior_threshold < 0) or
+            (self.posterior_alpha is not None and self.posterior_alpha < 0)):
            raise ValueError(
-                "Expected typical_acceptance_sampler_posterior_threshold "
-                "and typical_acceptance_sampler_posterior_alpha to be > 0. "
-                "Instead found "
-                f"typical_acceptance_sampler_posterior_threshold = "
-                f"{self.typical_acceptance_sampler_posterior_threshold} and "
-                f"typical_acceptance_sampler_posterior_alpha = "
-                f"{self.typical_acceptance_sampler_posterior_alpha}")
+                "Expected the posterior_threshold and posterior_alpha of "
+                "typical_acceptance_sampler to be > 0. "
+                "Instead found posterior_threshold = "
+                f"{self.posterior_threshold} and posterior_alpha = "
+                f"{self.posterior_alpha}")
+
+        if (self.disable_by_batch_size is not None
+                and self.disable_by_batch_size < 2):
+            raise ValueError("Expect the batch size threshold of disabling "
+                             "speculative decoding is > 1, but got "
+                             f"{self.disable_by_batch_size=}")

    @property
    def num_lookahead_slots(self) -> int:
@@ -2276,8 +2300,8 @@ class SpeculativeConfig:
        return self.num_speculative_tokens

    def __repr__(self) -> str:
-        if self.ngram_prompt_lookup_max > 0:
-            draft_model = "[ngram]"
+        if self.prompt_lookup_max is not None and self.prompt_lookup_max > 0:
+            draft_model = "ngram"
        else:
            draft_model = self.draft_model_config.model
        num_spec_tokens = self.num_speculative_tokens
@@ -2785,12 +2809,14 @@ class DecodingConfig:
        return hash_str

    def __post_init__(self):
-        valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
+        valid_guided_backends = [
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'guidance'
+        ]

        backend = GuidedDecodingParams(
            backend=self.guided_decoding_backend).backend_name
        if backend not in valid_guided_backends:
-            raise ValueError(f"Invalid guided_decoding_backend '{backend},"
+            raise ValueError(f"Invalid guided_decoding_backend '{backend}',"
                             f" must be one of {valid_guided_backends}")


@@ -3283,7 +3309,8 @@ class VllmConfig:
                                        init=True)  # type: ignore
    load_config: LoadConfig = field(default=None, init=True)  # type: ignore
    lora_config: Optional[LoRAConfig] = None
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig = field(default=None,
+                                                  init=True)  # type: ignore
    decoding_config: Optional[DecodingConfig] = None
    observability_config: Optional[ObservabilityConfig] = None
    prompt_adapter_config: Optional[PromptAdapterConfig] = None

--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -341,8 +341,10 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
        assert device in self._allocators
        return self._allocators[device].get_prefix_cache_hit_rate()

-    def reset_prefix_cache(self) -> bool:
-        """Reset prefix cache for all devices."""
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
+        """Reset prefix cache for specified or all devices."""
+        if device:
+            return self._allocators[device].reset_prefix_cache()
        success = True
        for allocator in self._allocators.values():
            success = success and allocator.reset_prefix_cache()

--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -305,7 +305,7 @@ class DeviceAwareBlockAllocator(ABC):
        pass

    @abstractmethod
-    def reset_prefix_cache(self) -> bool:
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache."""
        pass


--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -456,8 +456,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        return self.block_allocator.get_prefix_cache_hit_rate(device)

-    def reset_prefix_cache(self) -> bool:
-        return self.block_allocator.reset_prefix_cache()
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
+        return self.block_allocator.reset_prefix_cache(device)

    def _can_swap(self,
                  seq_group: SequenceGroup,

--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -2,7 +2,7 @@

 import enum
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Optional
 from typing import Sequence as GenericSequence
 from typing import Tuple

@@ -125,8 +125,8 @@ class BlockSpaceManager(ABC):
        pass

    @abstractmethod
-    def reset_prefix_cache(self) -> bool:
-        """Reset prefix cache for all devices."""
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
+        """Reset prefix cache for specified or all devices."""
        pass

    @abstractmethod

--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List, Tuple
+from typing import List, Optional, Tuple

 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.sequence import Sequence, SequenceGroup
@@ -92,7 +92,7 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager):
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        return -1

-    def reset_prefix_cache(self) -> bool:
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return True

    def get_num_cached_tokens(self, seq: Sequence) -> int:

--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -634,8 +634,8 @@ class Scheduler:
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        return self.block_manager.get_prefix_cache_hit_rate(device)

-    def reset_prefix_cache(self) -> bool:
-        return self.block_manager.reset_prefix_cache()
+    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
+        return self.block_manager.reset_prefix_cache(device)

    def get_num_unfinished_seq_groups(self) -> int:
        return len(self.waiting) + len(self.running) + len(self.swapped)

--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -6,16 +6,25 @@ from typing import Optional
 import torch
 from torch.distributed import ProcessGroup

+from vllm.config import get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.platforms import current_platform

 from .base_device_communicator import DeviceCommunicatorBase

+USE_RAY = parallel_config = get_current_vllm_config(
+).parallel_config.distributed_executor_backend == "ray"
+
+logger = init_logger(__name__)
+
 if current_platform.is_tpu():
+    import torch_xla
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr
    from torch_xla._internal import pjrt

-    from vllm.executor import ray_utils
+    if USE_RAY:
+        from vllm.executor import ray_utils


 class TpuCommunicator(DeviceCommunicatorBase):
@@ -33,19 +42,32 @@ class TpuCommunicator(DeviceCommunicatorBase):
        global_rank = self.global_rank
        global_world_size = self.global_world_size

-        # Calculate how many TPU nodes are in the current deployment. This
-        # is the Ray placement group if it is deployed with Ray. Default
-        # to the number of TPU nodes in the Ray cluster. The number of TPU
-        # nodes is computed by the total number of TPUs divided by the
-        # number of TPU accelerators per node, to account for clusters
-        # with both CPUs and TPUs.
-        num_nodes = ray_utils.get_num_tpu_nodes()
-        num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group()
-        if num_nodes_in_pg > 0:
-            num_nodes = num_nodes_in_pg
-
-        local_world_size = global_world_size // num_nodes
-        local_rank = global_rank % local_world_size
+        if USE_RAY:
+            logger.info("TpuCommunicator initialized with RAY")
+            # Calculate how many TPU nodes are in the current deployment. This
+            # is the Ray placement group if it is deployed with Ray. Default
+            # to the number of TPU nodes in the Ray cluster. The number of TPU
+            # nodes is computed by the total number of TPUs divided by the
+            # number of TPU accelerators per node, to account for clusters
+            # with both CPUs and TPUs.
+            num_nodes = ray_utils.get_num_tpu_nodes()
+            num_nodes_in_pg = ray_utils.get_num_nodes_in_placement_group()
+            if num_nodes_in_pg > 0:
+                num_nodes = num_nodes_in_pg
+
+            local_world_size = global_world_size // num_nodes
+            local_rank = global_rank % local_world_size
+        else:
+            logger.info("TpuCommunicator initialized with MP")
+            # Sanity: Verify we run on a single host
+            num_hosts = torch_xla.tpu.num_tpu_workers()
+            assert num_hosts == 1
+
+            # Get the current number of TPUs (we have locally)
+            local_world_size = torch_xla.tpu.num_available_chips()
+
+            # Get current rank
+            local_rank = global_rank % local_world_size

        # Ensure environment variables are set for multihost deployments.
        # On GKE, this is needed for libtpu and TPU driver to know which TPU

--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union

 import torch

+import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
@@ -37,6 +38,8 @@ class SimpleConnector(KVConnectorBase):

        self.config = config.kv_transfer_config
        self.tp_size = config.parallel_config.tensor_parallel_size
+        self.is_deepseek_mla = config.model_config.is_deepseek_mla
+        self.use_mla_opt = not envs.VLLM_MLA_DISABLE

        if self.config.kv_connector == "PyNcclConnector":
            from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import (
@@ -167,8 +170,26 @@ class SimpleConnector(KVConnectorBase):
        num_heads = int(model_config.num_key_value_heads / self.tp_size)
        hidden_size = model_config.hidden_size
        num_attention_heads = model_config.num_attention_heads
-        head_size = getattr(model_config, "head_dim",
-                            int(hidden_size // num_attention_heads))
+
+        # Deepseek's MLA (Multi-head Latent Attention) uses two different
+        # kv_cache shapes based on whether VLLM_MLA_DISABLE is set to 0.
+        # When VLLM_MLA_DISABLE=0 (default), forward absorb is applied,
+        # resulting in a kv_cache shape of [num_blks, blk_size, 1,
+        # kv_lora_rank + qk_rope_head_dim].
+        # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading
+        # to a kv_cache shape of [2, num_blks, blk_size,
+        # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim].
+        # For more details, see vllm/attention/backends/mla/common.py.
+        if self.is_deepseek_mla and self.use_mla_opt:
+            head_size = model_config.kv_lora_rank + \
+                model_config.qk_rope_head_dim
+            num_heads = 1
+        elif self.is_deepseek_mla and not self.use_mla_opt:
+            head_size = model_config.qk_nope_head_dim + \
+                model_config.qk_rope_head_dim
+        else:
+            head_size = getattr(model_config, "head_dim",
+                                int(hidden_size // num_attention_heads))

        # query_lens contains new KV caches that are added to vLLM.
        # so we will send them to decode instance
@@ -192,8 +213,12 @@ class SimpleConnector(KVConnectorBase):
            for layer_id in range(start_layer, end_layer):
                kv_cache = kv_caches[layer_id - start_layer]

-                key_cache = kv_cache[0].reshape(-1, num_heads, head_size)
-                value_cache = kv_cache[1].reshape(-1, num_heads, head_size)
+                if self.is_deepseek_mla and self.use_mla_opt:
+                    key_cache = kv_cache.reshape(-1, num_heads, head_size)
+                    value_cache = kv_cache.reshape(-1, num_heads, head_size)
+                else:
+                    key_cache = kv_cache[0].reshape(-1, num_heads, head_size)
+                    value_cache = kv_cache[1].reshape(-1, num_heads, head_size)

                current_slot_mapping = slot_mapping_flat[start_pos:end_pos]

@@ -223,6 +248,8 @@ class SimpleConnector(KVConnectorBase):
        # and hidden states.
        bypass_model_exec = True

+        model_config = model_executable.model.config
+
        input_tokens_tensor = model_input.input_tokens
        seq_lens = model_input.attn_metadata.seq_lens
        num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens
@@ -291,19 +318,35 @@ class SimpleConnector(KVConnectorBase):
                kv_cache = kv_caches[i - model_executable.model.start_layer]
                layer = model_executable.model.layers[i]

-                key_cache, value_cache = kv_cache[0], kv_cache[1]
-                ops.reshape_and_cache_flash(
-                    keys[i - model_executable.model.start_layer].to(
-                        key_cache.device),
-                    values[i - model_executable.model.start_layer].to(
-                        value_cache.device),
-                    key_cache,
-                    value_cache,
-                    slot_mapping[start_pos:end_pos],
-                    layer.self_attn.attn.kv_cache_dtype,
-                    layer.self_attn.attn._k_scale,
-                    layer.self_attn.attn._v_scale,
-                )
+                if self.is_deepseek_mla and self.use_mla_opt:
+                    layer.self_attn.attn = layer.self_attn.mla_attn
+                    k_c_normed_k_pe = keys[
+                        i - model_executable.model.start_layer].to(
+                            kv_cache.device).squeeze(1)
+                    k_c_normed = k_c_normed_k_pe[:, :model_config.kv_lora_rank]
+                    k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank:]
+                    ops.concat_and_cache_mla(
+                        k_c_normed,
+                        k_pe,
+                        kv_cache,
+                        slot_mapping[start_pos:end_pos],
+                        layer.self_attn.attn.kv_cache_dtype,
+                        layer.self_attn.attn._k_scale,
+                    )
+                else:
+                    key_cache, value_cache = kv_cache[0], kv_cache[1]
+                    ops.reshape_and_cache_flash(
+                        keys[i - model_executable.model.start_layer].to(
+                            key_cache.device),
+                        values[i - model_executable.model.start_layer].to(
+                            value_cache.device),
+                        key_cache,
+                        value_cache,
+                        slot_mapping[start_pos:end_pos],
+                        layer.self_attn.attn.kv_cache_dtype,
+                        layer.self_attn.attn._k_scale,
+                        layer.self_attn.attn._v_scale,
+                    )

            hidden_or_intermediate_states_for_one_req.append(hidden)


--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -26,7 +26,7 @@ from vllm.plugins import load_general_plugins
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, StoreBoolean
+from vllm.utils import FlexibleArgumentParser, StoreBoolean, is_in_ray_actor

 if TYPE_CHECKING:
    from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
@@ -40,7 +40,6 @@ DEVICE_OPTIONS = [
    "cuda",
    "neuron",
    "cpu",
-    "openvino",
    "tpu",
    "xpu",
    "hpu",
@@ -120,6 +119,7 @@ class EngineArgs:
    block_size: Optional[int] = None
    enable_prefix_caching: Optional[bool] = None
    disable_sliding_window: bool = False
+    disable_cascade_attn: bool = False
    use_v2_block_manager: bool = True
    swap_space: float = 4  # GiB
    cpu_offload_gb: float = 0  # GiB
@@ -177,7 +177,10 @@ class EngineArgs:

    guided_decoding_backend: str = 'xgrammar'
    logits_processor_pattern: Optional[str] = None
-    # Speculative decoding configuration.
+
+    speculative_config: Optional[Union[str, Dict[str, Any]]] = None
+
+    # TODO(Shangming): Deprecate these out-of-date params after next release
    speculative_model: Optional[str] = None
    speculative_model_quantization: Optional[str] = None
    speculative_draft_tensor_parallel_size: Optional[int] = None
@@ -190,9 +193,9 @@ class EngineArgs:
    spec_decoding_acceptance_method: str = 'rejection_sampler'
    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
-    qlora_adapter_name_or_path: Optional[str] = None
    disable_logprobs_during_spec_decoding: Optional[bool] = None

+    qlora_adapter_name_or_path: Optional[str] = None
    show_hidden_metrics_for_version: Optional[str] = None
    otlp_traces_endpoint: Optional[str] = None
    collect_detailed_traces: Optional[str] = None
@@ -338,9 +341,15 @@ class EngineArgs:
            'CoreWeave. See the Tensorize vLLM Model script in the Examples '
            'section for more information.\n'
            '* "runai_streamer" will load the Safetensors weights using Run:ai'
-            'Model Streamer \n'
+            'Model Streamer.\n'
            '* "bitsandbytes" will load the weights using bitsandbytes '
-            'quantization.\n')
+            'quantization.\n'
+            '* "sharded_state" will load weights from pre-sharded checkpoint '
+            'files, supporting efficient loading of tensor-parallel models\n'
+            '* "gguf" will load weights from GGUF format files (details '
+            'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n'
+            '* "mistral" will load weights from consolidated safetensors files '
+            'used by Mistral models.\n')
        parser.add_argument(
            '--config-format',
            default=EngineArgs.config_format,
@@ -774,7 +783,11 @@ class EngineArgs:
            const="True",
            help='If set, the prefill requests can be chunked based on the '
            'max_num_batched_tokens.')
-
+        parser.add_argument('--speculative-config',
+                            type=nullable_str,
+                            default=None,
+                            help='The configurations for speculative decoding.'
+                            ' Should be a JSON string.')
        parser.add_argument(
            '--speculative-model',
            type=nullable_str,
@@ -1096,6 +1109,16 @@ class EngineArgs:
            "using. This is used to parse the reasoning content into OpenAI "
            "API format. Required for ``--enable-reasoning``.")

+        parser.add_argument(
+            "--disable-cascade-attn",
+            action="store_true",
+            default=False,
+            help="Disable cascade attention for V1. While cascade attention "
+            "does not change the mathematical correctness, disabling it "
+            "could be useful for preventing potential numerical issues. "
+            "Note that even if this is set to False, cascade attention will be "
+            "only used when the heuristic tells that it's beneficial.")
+
        return parser

    @classmethod
@@ -1141,6 +1164,7 @@ class EngineArgs:
            max_seq_len_to_capture=self.max_seq_len_to_capture,
            max_logprobs=self.max_logprobs,
            disable_sliding_window=self.disable_sliding_window,
+            disable_cascade_attn=self.disable_cascade_attn,
            skip_tokenizer_init=self.skip_tokenizer_init,
            served_model_name=self.served_model_name,
            limit_mm_per_prompt=self.limit_mm_per_prompt,
@@ -1158,22 +1182,15 @@ class EngineArgs:
        )

    def create_load_config(self) -> LoadConfig:
-        # bitsandbytes quantization needs a specific model loader
-        # so we make sure the quant method and the load format are consistent
-        if (self.quantization == "bitsandbytes" or
-           self.qlora_adapter_name_or_path is not None) and \
-           self.load_format != "bitsandbytes":
-            raise ValueError(
-                "BitsAndBytes quantization and QLoRA adapter only support "
-                f"'bitsandbytes' load format, but got {self.load_format}")

-        if (self.load_format == "bitsandbytes" or
-            self.qlora_adapter_name_or_path is not None) and \
+        if(self.qlora_adapter_name_or_path is not None) and \
            self.quantization != "bitsandbytes":
            raise ValueError(
-                "BitsAndBytes load format and QLoRA adapter only support "
+                "QLoRA adapter only support "
                f"'bitsandbytes' quantization, but got {self.quantization}")

+        if self.quantization == "bitsandbytes":
+            self.load_format = "bitsandbytes"
        return LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
@@ -1182,6 +1199,82 @@ class EngineArgs:
            use_tqdm_on_load=self.use_tqdm_on_load,
        )

+    def create_speculative_config(
+        self,
+        target_model_config: ModelConfig,
+        target_parallel_config: ParallelConfig,
+        enable_chunked_prefill: bool,
+        disable_log_stats: bool,
+    ) -> Optional["SpeculativeConfig"]:
+        """Initializes and returns a SpeculativeConfig object based on
+        `speculative_config`.
+
+        This function utilizes `speculative_config` to create a
+        SpeculativeConfig object. The `speculative_config` can either be
+        provided as a JSON string input via CLI arguments or directly as a
+        dictionary from the engine. If `speculative_config` is not set, this
+        function will attempt to construct a configuration dictionary using
+        certain parameters, which are scheduled for deprecation in the next
+        release. Note that in next releases, `speculative_config` must be
+        provided, and the deprecated standalone speculative-related parameters
+        will be removed.
+        """
+        if self.speculative_config is None:
+            if (self.speculative_model is None
+                    and self.num_speculative_tokens is None):
+                return None
+
+            # TODO(Shangming): Deprecate this way of setting SpeculativeConfig,
+            # only allow '--speculative-config' after next release
+            logger.warning_once(
+                "Please use '--speculative-config' to set all configurations "
+                "related to speculative decoding. The current method of "
+                "specifying the model through '--speculative-model' and "
+                "adding related parameters (e.g., '--num-speculative-tokens') "
+                "separately will be deprecated in the next release.")
+
+            spec_config_dict = {
+                "model": self.speculative_model,
+                "quantization": self.speculative_model_quantization,
+                "max_model_len": self.speculative_max_model_len,
+                "draft_tensor_parallel_size":
+                self.speculative_draft_tensor_parallel_size,
+                "num_speculative_tokens": self.num_speculative_tokens,
+                "disable_mqa_scorer": self.speculative_disable_mqa_scorer,
+                "disable_by_batch_size":
+                self.speculative_disable_by_batch_size,
+                "prompt_lookup_max": self.ngram_prompt_lookup_max,
+                "prompt_lookup_min": self.ngram_prompt_lookup_min,
+                "acceptance_method": self.spec_decoding_acceptance_method,
+                "posterior_threshold":
+                self.typical_acceptance_sampler_posterior_threshold,
+                "posterior_alpha":
+                self.typical_acceptance_sampler_posterior_alpha,
+                "disable_logprobs": self.disable_logprobs_during_spec_decoding,
+            }
+
+            self.speculative_config = spec_config_dict
+        else:
+            if isinstance(self.speculative_config, str):
+                import ast
+                self.speculative_config = ast.literal_eval(
+                    self.speculative_config)
+        # Note(Shangming): These parameters are not obtained from the cli arg
+        # '--speculative-config' and must be passed in when creating the engine
+        # config.
+
+        assert isinstance(self.speculative_config, dict)
+        self.speculative_config.update({
+            "target_model_config": target_model_config,
+            "target_parallel_config": target_parallel_config,
+            "enable_chunked_prefill": enable_chunked_prefill,
+            "disable_log_stats": disable_log_stats,
+        })
+        speculative_config = SpeculativeConfig.from_dict(
+            self.speculative_config)
+
+        return speculative_config
+
    def create_engine_config(
        self,
        usage_context: Optional[UsageContext] = None,
@@ -1228,6 +1321,8 @@ class EngineArgs:
        else:
            self._set_default_args_v0(model_config)

+        assert self.enable_chunked_prefill is not None
+
        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
@@ -1240,6 +1335,18 @@ class EngineArgs:
            cpu_offload_gb=self.cpu_offload_gb,
            calculate_kv_scales=self.calculate_kv_scales,
        )
+
+        # Get the current placement group if Ray is initialized and
+        # we are in a Ray actor. If so, then the placement group will be
+        # passed to spawned processes.
+        placement_group = None
+        if is_in_ray_actor():
+            import ray
+
+            # This call initializes Ray automatically if it is not initialized,
+            # but we should not do this here.
+            placement_group = ray.util.get_current_placement_group()
+
        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
@@ -1252,36 +1359,17 @@ class EngineArgs:
                self.tokenizer_pool_extra_config,
            ),
            ray_workers_use_nsight=self.ray_workers_use_nsight,
+            placement_group=placement_group,
            distributed_executor_backend=self.distributed_executor_backend,
            worker_cls=self.worker_cls,
            worker_extension_cls=self.worker_extension_cls,
        )

-        speculative_config = SpeculativeConfig.maybe_create_spec_config(
+        speculative_config = self.create_speculative_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
-            target_dtype=self.dtype,
-            speculative_model=self.speculative_model,
-            speculative_model_quantization = \
-                self.speculative_model_quantization,
-            speculative_draft_tensor_parallel_size = \
-                self.speculative_draft_tensor_parallel_size,
-            num_speculative_tokens=self.num_speculative_tokens,
-            speculative_disable_mqa_scorer=self.speculative_disable_mqa_scorer,
-            speculative_disable_by_batch_size=self.
-            speculative_disable_by_batch_size,
-            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_log_stats=self.disable_log_stats,
-            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
-            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
-            draft_token_acceptance_method=\
-                self.spec_decoding_acceptance_method,
-            typical_acceptance_sampler_posterior_threshold=self.
-            typical_acceptance_sampler_posterior_threshold,
-            typical_acceptance_sampler_posterior_alpha=self.
-            typical_acceptance_sampler_posterior_alpha,
-            disable_logprobs=self.disable_logprobs_during_spec_decoding,
        )

        # Reminder: Please update docs/source/features/compatibility_matrix.md
@@ -1436,16 +1524,6 @@ class EngineArgs:
                               recommend_to_remove=False)
            return False

-        if self.worker_cls != EngineArgs.worker_cls:
-            _raise_or_fallback(feature_name="--worker-cls",
-                               recommend_to_remove=False)
-            return False
-
-        if self.worker_extension_cls != EngineArgs.worker_extension_cls:
-            _raise_or_fallback(feature_name="--worker-extension-cls",
-                               recommend_to_remove=False)
-            return False
-
        if self.num_scheduler_steps != EngineArgs.num_scheduler_steps:
            _raise_or_fallback(feature_name="--num-scheduler-steps",
                               recommend_to_remove=True)
@@ -1462,7 +1540,9 @@ class EngineArgs:
            return False

        # Only support Xgrammar for guided decoding so far.
-        SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"]
+        SUPPORTED_GUIDED_DECODING = [
+            "xgrammar", "xgrammar:disable-any-whitespace"
+        ]
        if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
            _raise_or_fallback(feature_name="--guided-decoding-backend",
                               recommend_to_remove=False)
@@ -1554,7 +1634,7 @@ class EngineArgs:
        if (self.speculative_model is not None
                or self.num_speculative_tokens is not None):
            # This is supported but experimental (handled below).
-            if self.speculative_model == "[ngram]":
+            if self.speculative_model in ("ngram", "[ngram]"):
                pass
            else:
                _raise_or_fallback(feature_name="Speculative Decoding",
@@ -1570,7 +1650,7 @@ class EngineArgs:
        # No FlashInfer or XFormers so far.
        V1_BACKENDS = [
            "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1",
-            "TRITON_MLA", "FLASHMLA"
+            "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA"
        ]
        if (envs.is_set("VLLM_ATTENTION_BACKEND")
                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
@@ -1602,7 +1682,8 @@ class EngineArgs:
            return False

        # ngram is supported on V1, but off by default for now.
-        if self.speculative_model == "[ngram]" and _warn_or_fallback("ngram"):
+        if self.speculative_model in (
+                "ngram", "[ngram]") and _warn_or_fallback("ngram"):
            return False

        # Non-CUDA is supported on V1, but off by default for now.
@@ -1683,7 +1764,7 @@ class EngineArgs:
        # V1 should use the new scheduler by default.
        # Swap it only if this arg is set to the original V0 default
        if self.scheduler_cls == EngineArgs.scheduler_cls:
-            self.scheduler_cls = "vllm.v1.core.scheduler.Scheduler"
+            self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

        # When no user override, set the default values based on the usage
        # context.

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -35,7 +35,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import deprecate_kwargs, weak_bind
+from vllm.utils import Device, deprecate_kwargs, weak_bind

 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -492,7 +492,6 @@ class _AsyncLLMEngine(LLMEngine):

        preprocessed_inputs = await self.input_preprocessor.preprocess_async(
            prompt,
-            request_id=request_id,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
        )
@@ -1216,8 +1215,9 @@ class AsyncLLMEngine(EngineClient):
    async def stop_profile(self) -> None:
        self.engine.stop_profile()

-    async def reset_prefix_cache(self) -> None:
-        self.engine.reset_prefix_cache()
+    async def reset_prefix_cache(self,
+                                 device: Optional[Device] = None) -> None:
+        self.engine.reset_prefix_cache(device)

    async def sleep(self, level: int = 1) -> None:
        self.engine.sleep(level)