Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
20 changed files
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -726,15 +726,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
        default="ttft,tpot,itl",
        help="Comma-seperated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
-        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-seperated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
-        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
    parser.add_argument(

--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -381,8 +381,8 @@ class VllmBackend:
                with open(filepath) as f:
                    hash_content.append(f.read())
            import hashlib
-            code_hash = hashlib.md5(
+            code_hash = hashlib.md5("\n".join(hash_content).encode(),
-                "\n".join(hash_content).encode()).hexdigest()
+                                    usedforsecurity=False).hexdigest()
            factors.append(code_hash)
            # 3. compiler hash
@@ -390,7 +390,8 @@ class VllmBackend:
            factors.append(compiler_hash)
            # combine all factors to generate the cache dir
-            hash_key = hashlib.md5(str(factors).encode()).hexdigest()[:10]
+            hash_key = hashlib.md5(str(factors).encode(),
+                                   usedforsecurity=False).hexdigest()[:10]
            cache_dir = os.path.join(
                envs.VLLM_CACHE_ROOT,

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
 # SPDX-License-Identifier: Apache-2.0
+import contextlib
 import copy
 import hashlib
+import importlib.metadata
 import os
 from contextlib import ExitStack
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -9,6 +11,7 @@ from unittest.mock import patch
 import torch
 import torch._inductor.compile_fx
 import torch.fx as fx
+from packaging.version import Version
 from vllm.config import VllmConfig
@@ -139,10 +142,12 @@ class InductorAdaptor(CompilerInterface):
        from torch._inductor.codecache import torch_key
        torch_factors = torch_key()
        factors.append(torch_factors)
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10]
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()[:10]
        return hash_str
    def initialize_cache(self, cache_dir: str, disable_cache: bool = False):
+        self.cache_dir = cache_dir
        if disable_cache:
            return
        # redirect the cache directory to a sub-directory
@@ -155,7 +160,6 @@ class InductorAdaptor(CompilerInterface):
        triton_cache = os.path.join(cache_dir, "triton_cache")
        os.makedirs(triton_cache, exist_ok=True)
        os.environ["TRITON_CACHE_DIR"] = triton_cache
-        self.cache_dir = cache_dir
    def compile(
        self,
@@ -228,7 +232,20 @@ class InductorAdaptor(CompilerInterface):
                inductor_compiled_graph = output
                if inductor_compiled_graph is not None:
                    nonlocal file_path
-                    file_path = inductor_compiled_graph.current_callable.__code__.co_filename  # noqa
+                    compiled_fn = inductor_compiled_graph.current_callable
+                    file_path = compiled_fn.__code__.co_filename  # noqa
+                    if not file_path.startswith(self.cache_dir):
+                        # hooked in the align_inputs_from_check_idxs function
+                        # in torch/_inductor/utils.py
+                        for cell in compiled_fn.__closure__:
+                            if not callable(cell.cell_contents):
+                                continue
+                            code = cell.cell_contents.__code__
+                            if code.co_filename.startswith(self.cache_dir):
+                                # this is the real file path
+                                # compiled from Inductor
+                                file_path = code.co_filename
+                                break
                    hash_str = inductor_compiled_graph._fx_graph_cache_key
                return output
@@ -271,6 +288,9 @@ class InductorAdaptor(CompilerInterface):
                    "torch._inductor.codecache.FxGraphCache._check_can_cache",
                    _check_can_cache))
+            # Dynamo metrics context, see method for more details.
+            stack.enter_context(self.metrics_context())
            compiled_graph = compile_fx(
                graph,
                example_inputs,
@@ -295,8 +315,14 @@ class InductorAdaptor(CompilerInterface):
        hash_str = handle[0]
        from torch._inductor.codecache import FxGraphCache
-        with patch("torch._inductor.codecache.FxGraphCache._get_shape_env",
+        with ExitStack() as exit_stack:
-                   lambda *args, **kwargs: AlwaysHitShapeEnv()):
+            exit_stack.enter_context(
+                patch("torch._inductor.codecache.FxGraphCache._get_shape_env",
+                      lambda *args, **kwargs: AlwaysHitShapeEnv()))
+            # Dynamo metrics context, see method for more details.
+            exit_stack.enter_context(self.metrics_context())
            if torch.__version__.startswith("2.5"):
                inductor_compiled_graph = FxGraphCache._lookup_graph(
                    hash_str, example_inputs, True, False)
@@ -337,6 +363,28 @@ class InductorAdaptor(CompilerInterface):
        return compiled_graph
+    def metrics_context(self) -> contextlib.AbstractContextManager:
+        """
+        This method returns the Dynamo metrics context (if it exists,
+        otherwise a null context). It is used by various compile components.
+        Present in torch>=2.6, it's used inside FxGraphCache in
+        torch==2.6 (but not after). It might also be used in various other
+        torch.compile internal functions.
+        Because it is re-entrant, we always set it (even if entering via Dynamo
+        and the context was already entered). We might want to revisit if it
+        should be set at a different level of compilation.
+        This is likely a bug in PyTorch: public APIs should not rely on
+        manually setting up internal contexts. But we also rely on non-public
+        APIs which might not provide these guarantees.
+        """
+        if Version(importlib.metadata.version('torch')) >= Version("2.6"):
+            import torch._dynamo.utils
+            return torch._dynamo.utils.get_metrics_context()
+        else:
+            return contextlib.nullcontext()
 class EagerAdaptor(CompilerInterface):
    name = "eager"

--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -4,8 +4,6 @@ from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
 import torch
 import torch._inductor.pattern_matcher as pm
-# TODO(luka) use vllm.utils once #10836 landed
-from compressed_tensors.quantization import FP8_DTYPE
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
@@ -13,12 +11,14 @@ from torch._ops import OpOverload
 from vllm.config import CompilationConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from .fx_utils import find_getitem_maybe
 from .multi_output_match import MultiOutputMatch
 from .vllm_inductor_pass import VllmInductorPass
 logger = init_logger(__name__)
+FP8_DTYPE = current_platform.fp8_dtype()
 def empty_bf16(*args, **kwargs):

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -29,7 +29,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                     get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import CpuArchEnum
+from vllm.platforms import CpuArchEnum, current_platform
 from vllm.sampling_params import GuidedDecodingParams
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
@@ -38,9 +38,10 @@ from vllm.transformers_utils.config import (
    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
    try_get_generation_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
-from vllm.transformers_utils.utils import is_s3
+from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, random_uuid, resolve_obj_by_qualname)
+                        get_cpu_memory, get_open_port, random_uuid,
+                        resolve_obj_by_qualname)
 if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
@@ -221,6 +222,9 @@ class ModelConfig:
        factors.append(self.trust_remote_code)
        factors.append(self.rope_scaling)
        factors.append(self.rope_theta)
+        # rope cos/sin cache depends on the max_position_embeddings
+        factors.append(
+            getattr(self.hf_config, "max_position_embeddings", "None"))
        return hashlib.sha256(str(factors).encode()).hexdigest()
    def __init__(
@@ -263,9 +267,13 @@ class ModelConfig:
        override_generation_config: Optional[dict[str, Any]] = None,
        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
    ) -> None:
-        self.model = model
+        self.model = maybe_model_redirect(model)
+        self.tokenizer = maybe_model_redirect(tokenizer)
        self.hf_config_path = hf_config_path
-        self.tokenizer = tokenizer
+        if isinstance(hf_config_path, str):
+            self.hf_config_path = maybe_model_redirect(hf_config_path)
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.allowed_local_media_path = allowed_local_media_path
@@ -309,8 +317,8 @@ class ModelConfig:
            ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
            raise ValueError(
                "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
-                "module was not found."
+                "module was not found. See "
-                "See https://github.com/vllm-project/vllm/blob/main/Dockerfile"
+                "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile "  # noqa: E501
                "for instructions on how to install it.")
        # The tokenizer version is consistent with the model version by default.
@@ -346,6 +354,8 @@ class ModelConfig:
        self.hf_config = hf_config
        self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.attention_chunk_size = getattr(self.hf_text_config,
+                                            "attention_chunk_size", None)
        self.encoder_config = self._get_encoder_config()
        self.hf_image_processor_config = get_hf_image_processor_config(
            self.model, revision)
@@ -403,6 +413,7 @@ class ModelConfig:
        self.is_attention_free = self._init_attention_free()
        self.is_hybrid = self._init_is_hybrid()
+        self.has_noops = self._init_has_noops()
        self.has_inner_state = self._init_has_inner_state()
        if current_platform.is_neuron():
@@ -502,6 +513,10 @@ class ModelConfig:
    def _init_is_hybrid(self) -> bool:
        return self.registry.is_hybrid_model(self.architectures)
+    def _init_has_noops(self) -> bool:
+        architectures = getattr(self.hf_config, "architectures", [])
+        return self.registry.is_noops_model(architectures)
    def _init_has_inner_state(self) -> bool:
        return self.registry.model_has_inner_state(self.architectures)
@@ -671,11 +686,19 @@ class ModelConfig:
            self.max_seq_len_to_capture = self.max_model_len
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)
+        ROCM_UNSUPPORTED_MODELS = ['mllama']
+        if (self.hf_config.model_type in ROCM_UNSUPPORTED_MODELS
+                and not self.enforce_eager and current_platform.is_rocm()):
+            logger.warning(
+                "CUDA graph is not supported for %s on ROCm yet, fallback "
+                "to the eager mode.", self.hf_config.model_type)
+            self.enforce_eager = True
    def _verify_bnb_config(self) -> None:
        """
-        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        The current version of bitsandbytes (0.45.3) with 8-bit models does not
        yet support CUDA graph.
+        # TODO: Remove this once bitsandbytes supports CUDA graph.
        """
        is_bitsandbytes = self.quantization == "bitsandbytes"
        has_quantization_config = (getattr(self.hf_config,
@@ -690,8 +713,9 @@ class ModelConfig:
                not self.enforce_eager,
        ]):
            logger.warning(
-                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "CUDA graph is not supported on BitsAndBytes 8bit yet, "
                "fallback to the eager mode.")
            self.enforce_eager = True
    def _verify_with_expert_parallelism(self) -> None:
@@ -746,6 +770,12 @@ class ModelConfig:
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
+        if parallel_config.distributed_executor_backend == "external_launcher":
+            assert self.seed is not None, (
+                "Seed must be set when using external launcher backend to "
+                "make sure sampling results are the same across workers.")
        total_num_attention_heads = getattr(self.hf_text_config,
                                            "num_attention_heads", 0)
        tensor_parallel_size = parallel_config.tensor_parallel_size
@@ -797,10 +827,18 @@ class ModelConfig:
    @property
    def is_deepseek_mla(self) -> bool:
-        return (hasattr(self.hf_text_config, "model_type")) \
+        if not hasattr(self.hf_text_config, "model_type"):
-                and (self.hf_text_config.model_type in \
+            return False
-                    ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'))\
+        elif self.hf_text_config.model_type in \
-                and (self.hf_text_config.kv_lora_rank is not None)
+            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp'):
+            return self.hf_text_config.kv_lora_rank is not None
+        elif self.hf_text_config.model_type == 'eagle':
+            # if the model is an EAGLE module, check for the
+            # underlying architecture
+            return self.hf_text_config.model.model_type in \
+                    ('deepseek_v2', 'deepseek_v3') \
+                and self.hf_text_config.kv_lora_rank is not None
+        return False
    def get_head_size(self) -> int:
        # TODO remove hard code
@@ -854,6 +892,14 @@ class ModelConfig:
            return getattr(self.hf_config.attn_config, "kv_n_heads",
                           self.hf_config.num_attention_heads)
+        if self.hf_config.model_type == "nemotron-nas":
+            for block in self.hf_config.block_configs:
+                if not block.attention.no_op:
+                    return self.hf_config.num_attention_heads \
+                        // block.attention.n_heads_in_group
+            raise RuntimeError("Couldn't determine number of kv heads")
        if self.is_attention_free:
            return 0
@@ -922,7 +968,9 @@ class ModelConfig:
        # This function relies on 'layers_block_type' in hf_config,
        # for w/o this attribute, we will need to have workarounds like so
        attn_block_type = block_type == LayerBlockType.attention
-        is_transformer = not self.is_hybrid and not self.is_attention_free
+        is_transformer = not self.is_hybrid and \
+                            not self.has_noops and \
+                            not self.is_attention_free
        start, end = self.get_layers_start_end_indices(parallel_config)
        if is_transformer:
@@ -933,27 +981,39 @@ class ModelConfig:
            # Note that this code assumes there
            # is only one type of attention-free block type.
            return 0 if attn_block_type else end - start
+        elif self.has_noops:
+            block_configs = self.hf_config.block_configs
+            return sum(not bc.attention.no_op
+                       for bc in block_configs[start:end])
        else:
-            # Hybrid model
+            # Hybrid model Jamba
            layers_block_type_value = getattr(self.hf_config,
                                              "layers_block_type", None)
-            if layers_block_type_value is None:
+            if layers_block_type_value is not None:
-                raise ValueError("The model is an hybrid without a "
+                if hasattr(self.hf_text_config,
-                                 "layers_block_type in the hf_config, "
+                           "model_type") and (self.hf_text_config.model_type
-                                 "cannot determine the num of "
+                                              == "zamba2"):
-                                 f"{block_type.value} layers")
+                    if attn_block_type:
+                        return sum(t == "hybrid"
-            if hasattr(self.hf_text_config,
+                                   for t in layers_block_type_value[start:end])
-                       "model_type") and (self.hf_text_config.model_type
+                    else:
-                                          == "zamba2"):
+                        return self.get_num_layers(parallel_config)
-                if attn_block_type:
+                return sum(t == block_type.value
-                    return sum(t == "hybrid"
+                           for t in layers_block_type_value[start:end])
-                               for t in layers_block_type_value[start:end])
-                else:
+            # Hybrid model Minimax
-                    return self.get_num_layers(parallel_config)
+            attn_type_list = getattr(self.hf_config, "attn_type_list", None)
+            if attn_type_list:
+                return sum(t == 1 for t in attn_type_list[start:end])
+            if layers_block_type_value is None and attn_type_list is None:
+                raise ValueError(
+                    "The model is an hybrid without a "
+                    "layers_block_type or an attn_type_list in the hf_config, "
+                    "cannot determine the num of "
+                    f"{block_type.value} layers")
-            return sum(t == block_type.value
+            return sum(t == 1 for t in attn_type_list[start:end])
-                       for t in layers_block_type_value[start:end])
    def get_multimodal_config(self) -> "MultiModalConfig":
        """
@@ -1079,8 +1139,7 @@ class CacheConfig:
        is_attention_free: Whether the model is attention-free.
        num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
            profiled num_gpu_blocks if specified. Does nothing if None.
-        sliding_window: Sliding window size for the KV cache. Can not work with
+        sliding_window: Sliding window size for the KV cache.
-            prefix caching enabled.
        enable_prefix_caching: Whether to enable prefix caching.
        cpu_offload_gb: Size of the CPU offload buffer in GiB.
    """
@@ -1100,7 +1159,8 @@ class CacheConfig:
        factors: list[Any] = []
        factors.append(self.cache_dtype)
        # `cpu_offload_gb` does not use `torch.compile` yet.
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __init__(
@@ -1113,6 +1173,7 @@ class CacheConfig:
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
        enable_prefix_caching: bool = False,
+        prefix_caching_hash_algo: str = "builtin",
        cpu_offload_gb: float = 0,
        calculate_kv_scales: Optional[bool] = None,
    ) -> None:
@@ -1124,6 +1185,7 @@ class CacheConfig:
        self.is_attention_free = is_attention_free
        self.sliding_window = sliding_window
        self.enable_prefix_caching = enable_prefix_caching
+        self.prefix_caching_hash_algo = prefix_caching_hash_algo
        self.cpu_offload_gb = cpu_offload_gb
        self.calculate_kv_scales = calculate_kv_scales
        self._verify_args()
@@ -1174,6 +1236,13 @@ class CacheConfig:
                "Prefix caching is not supported with sliding window. "
                "Run with --disable-sliding-window to use prefix caching.")
+        if self.enable_prefix_caching and self.prefix_caching_hash_algo not in (
+                "builtin", "sha256"):
+            raise ValueError(
+                "Unknown prefix caching hash algorithm: "
+                f"{self.prefix_caching_hash_algo}. Must be either "
+                "'builtin' or 'sha256'.")
    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
@@ -1223,7 +1292,8 @@ class TokenizerPoolConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -1334,7 +1404,8 @@ class LoadConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -1362,6 +1433,8 @@ class ParallelConfig:
    tensor_parallel_size: int = 1  # Number of tensor parallel groups.
    data_parallel_size: int = 1  # Number of data parallel groups.
    data_parallel_rank: int = 0  # Rank of the data parallel group.
+    # Local rank of the data parallel group, defaults to global rank.
+    data_parallel_rank_local: Optional[int] = None
    # IP of the data parallel master.
    data_parallel_master_ip: str = "127.0.0.1"
    data_parallel_master_port: int = 29500  # Port of the data parallel master.
@@ -1466,10 +1539,18 @@ class ParallelConfig:
        self.world_size = self.pipeline_parallel_size * \
            self.tensor_parallel_size
-        self.data_parallel_size = envs.VLLM_DP_SIZE
+        if self.data_parallel_size > 1:
-        self.data_parallel_rank = envs.VLLM_DP_RANK
+            # Data parallel was specified in the engine args.
-        self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
+            self.data_parallel_master_port = get_open_port()
-        self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
+            # TODO multi-node
+        else:
+            # Otherwise fall back to env vars (e.g. for offline SPMD case).
+            self.data_parallel_size = envs.VLLM_DP_SIZE
+            self.data_parallel_rank = envs.VLLM_DP_RANK
+            self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
+            self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
+            self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
        self.world_size_across_dp = self.world_size * self.data_parallel_size
        if self.distributed_executor_backend == "external_launcher":
@@ -1547,11 +1628,11 @@ class ParallelConfig:
        if self.use_ray:
            from vllm.executor import ray_utils
            ray_utils.assert_ray_available()
-        # if current_platform.is_rocm():
+        # if not current_platform.use_custom_allreduce():
        #     self.disable_custom_all_reduce = True
        #     logger.info(
        #         "Disabled the custom all-reduce kernel because it is not "
-        #         "supported on hcus.")
+        #         "supported on current platform.")
        if self.ray_workers_use_nsight and not self.use_ray:
            raise ValueError("Unable to use nsight profiling unless workers "
                             "run with Ray.")
@@ -1654,7 +1735,8 @@ class SchedulerConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self) -> None:
@@ -1790,7 +1872,8 @@ class DeviceConfig:
        # the device/platform information will be summarized
        # by torch/vllm automatically.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __init__(self, device: str = "auto") -> None:
@@ -1799,7 +1882,10 @@ class DeviceConfig:
            from vllm.platforms import current_platform
            self.device_type = current_platform.device_type
            if not self.device_type:
-                raise RuntimeError("Failed to infer device type")
+                raise RuntimeError(
+                    "Failed to infer device type, please set "
+                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
+                    "to turn on verbose logging to help debug the issue.")
        else:
            # Device type is assigned explicitly
            self.device_type = device
@@ -1963,7 +2049,8 @@ class SpeculativeConfig:
        # no factors to consider.
        # spec decode does not use `torch.compile` yet.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    @classmethod
@@ -1985,14 +2072,13 @@ class SpeculativeConfig:
    def __post_init__(self):
-        # Note: After next release, the method parameter will be used to
+        # Note: "method" is a new parameter that helps to extend the
-        # specify the speculative method, which helps to extend the
+        # configuration of non-model-based proposers, and the "model" parameter
-        # configuration of non-model-based proposers, and the model parameter
+        # will be used to set the draft model, eagle head, or additional weight
-        # will be used when the draft model or head is needed.
+        # when needed. If users do not specify "method", the speculative method
-        # If users do not specify the method, the speculative method will
+        # will be detected automatically if possible. If the speculative method
-        # be detected automatically if possible. If the speculative method can
+        # can not be detected, it will be considered as the "draft_model" by
-        # not be detected, it will be considered as the draft-model-based
+        # default.
-        # method by default.
        if self.model is None and self.num_speculative_tokens is not None:
            # TODO(Shangming): Refactor mtp configuration logic when supporting
@@ -2007,8 +2093,8 @@ class SpeculativeConfig:
                raise ValueError("num_speculative_tokens was provided without "
                                 "speculative model.")
-        # Automatically configure the ngram method during configuration
+        # Automatically configure the method for ngram when "model" is used
-        # refactoring to ensure a smooth transition.
+        # instead of "method"
        if self.method is None and (self.model is not None
                                    and self.model in ("ngram", "[ngram]")):
            self.method = "ngram"
@@ -2090,9 +2176,10 @@ class SpeculativeConfig:
                # Replace hf_config for EAGLE draft_model
                if self.method == "eagle":
-                    if self.enable_chunked_prefill:
+                    if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
                        raise ValueError(
-                            "Chunked prefill and EAGLE are not compatible.")
+                            "Chunked prefill and EAGLE are not compatible "
+                            "when using V0.")
                    from vllm.transformers_utils.configs.eagle import (
                        EAGLEConfig)
@@ -2302,12 +2389,10 @@ class SpeculativeConfig:
        return self.num_speculative_tokens
    def __repr__(self) -> str:
-        if self.prompt_lookup_max is not None and self.prompt_lookup_max > 0:
+        method = self.method
-            draft_model = "ngram"
+        model = None if method == "ngram" else self.draft_model_config.model
-        else:
-            draft_model = self.draft_model_config.model
        num_spec_tokens = self.num_speculative_tokens
-        return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
+        return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
 @dataclass
@@ -2343,7 +2428,8 @@ class LoRAConfig:
        factors.append(self.lora_extra_vocab_size)
        factors.append(self.long_lora_scaling_factors)
        factors.append(self.bias_enabled)
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -2373,9 +2459,9 @@ class LoRAConfig:
                f"max_loras ({self.max_loras}) is 1")
    def verify_with_cache_config(self, cache_config: CacheConfig):
-        # TODO LoRA supports CPU offload.
+        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-        if cache_config.cpu_offload_gb > 0:
+            raise ValueError(
-            raise ValueError("CPU offload is not supported with LoRA yet.")
+                "V0 LoRA does not support CPU offload, please use V1.")
    def verify_with_model_config(self, model_config: ModelConfig):
        if self.lora_dtype in (None, "auto"):
@@ -2413,7 +2499,8 @@ class PromptAdapterConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -2458,7 +2545,8 @@ class MultiModalConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def get_limit_per_prompt(self, modality: str) -> int:
@@ -2524,7 +2612,8 @@ class PoolerConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    @staticmethod
@@ -2661,6 +2750,10 @@ def _get_and_verify_max_len(
            max_len_key = key if max_len < derived_max_model_len \
                else max_len_key
            derived_max_model_len = min(derived_max_model_len, max_len)
+    # For Command-R / Cohere, Cohere2 / Aya Vision models
+    if tmp_max_len := getattr(hf_config, "model_max_length", None):
+        max_len_key = "model_max_length"
+        derived_max_model_len = tmp_max_len
    # If sliding window is manually disabled, max_length should be less
    # than the sliding window length in the model config.
@@ -2805,7 +2898,8 @@ class DecodingConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -2855,7 +2949,8 @@ class ObservabilityConfig:
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    def __post_init__(self):
@@ -2917,7 +3012,8 @@ class KVTransferConfig(BaseModel):
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
        return hash_str
    @classmethod
@@ -2945,12 +3041,6 @@ class KVTransferConfig(BaseModel):
        return self.kv_connector is not None and \
            self.kv_role in ["kv_producer", "kv_consumer", "kv_both"]
-    @property
-    def need_kv_parallel_group(self) -> bool:
-        # for those database-based connector, vLLM does not need to create
-        # parallel group, and in that case the kv parallel size will be 1.
-        return self.kv_connector is not None and self.kv_parallel_size > 1
    @property
    def is_kv_producer(self) -> bool:
        return self.kv_connector is not None and \
@@ -3414,7 +3504,8 @@ class VllmConfig:
            vllm_factors.append("None")
        factors.append(vllm_factors)
-        hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10]
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()[:10]
        return hash_str
    def pad_for_cudagraph(self, batch_size: int) -> int:
@@ -3526,9 +3617,10 @@ class VllmConfig:
        if self.cache_config is not None and \
            self.cache_config.cpu_offload_gb > 0 and \
-            self.compilation_config.level != CompilationLevel.NO_COMPILATION:
+            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
+                and not envs.VLLM_USE_V1:
            logger.warning(
-                "CPU offload is not supported with `torch.compile` yet."
+                "CPU offload is not supported with `torch.compile` in v0 yet."
                " Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -8,6 +8,7 @@
 # not sure why, they are created from a different context.
 # the only successful approach is to call cuda driver API in C.
 import dataclasses
+import gc
 import os
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, Optional, Tuple, Union
@@ -175,7 +176,7 @@ class CuMemAllocator:
                                         str]] = None) -> None:
        """
        Put the allocator in sleep mode.
-        All data in the memory allocation with the specified tag will be 
+        All data in the memory allocation with the specified tag will be
        offloaded to CPU memory, and others will be discarded.
        :param offload_tags: The tags of the memory allocation that will be
@@ -204,28 +205,37 @@ class CuMemAllocator:
                data.cpu_backup_tensor = cpu_backup_tensor
            unmap_and_release(handle)
-    def wake_up(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        """
        Wake up the allocator from sleep mode.
        All data that is previously offloaded will be loaded back to GPU 
-        memory, and the rest of the data will have empty memory."""
+        memory, and the rest of the data will have empty memory.
+        :param tags: The tags of the memory allocation that will be loaded
+            back to GPU memory. If None, all memory allocation will be loaded
+            back to GPU memory.
+        """
        for ptr, data in self.pointer_to_data.items():
-            handle = data.handle
+            if tags is None or data.tag in tags:
-            create_and_map(handle)
+                handle = data.handle
-            if data.cpu_backup_tensor is not None:
+                create_and_map(handle)
-                cpu_backup_tensor = data.cpu_backup_tensor
+                if data.cpu_backup_tensor is not None:
-                if cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
-                    size_in_bytes = cpu_backup_tensor.numel(
+                    if cpu_backup_tensor is not None:
-                    ) * cpu_backup_tensor.element_size()
+                        size_in_bytes = cpu_backup_tensor.numel(
-                    cpu_ptr = cpu_backup_tensor.data_ptr()
+                        ) * cpu_backup_tensor.element_size()
-                    libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
+                        cpu_ptr = cpu_backup_tensor.data_ptr()
-                    data.cpu_backup_tensor = None
+                        libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes)
+                        data.cpu_backup_tensor = None
    @contextmanager
    def use_memory_pool(self, tag: Optional[str] = None):
        """
        A context manager to use the memory pool.
-        All memory allocation created inside the context will be allocated 
+        All memory allocation created inside the context will be allocated
        in the memory pool, and has the specified tag.
        :param tag: The tag of the memory allocation. If None, the default tag

--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional
+import os
+from typing import List, Optional
 import torch
 from torch.distributed import ProcessGroup
+from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
 from .base_device_communicator import DeviceCommunicatorBase
@@ -16,19 +20,120 @@ class CpuCommunicator(DeviceCommunicatorBase):
                 device_group: Optional[ProcessGroup] = None,
                 unique_name: str = ""):
        super().__init__(cpu_group, device, device_group, unique_name)
-        self.ipex_available = False
        self.dist_module = torch.distributed
-        try:
-            import intel_extension_for_pytorch as ipex
+        if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
-            self.ipex_available = True
+            self.dist_module = _CPUSHMDistributed(self)
-            self.dist_module = ipex.distributed
-        except ImportError:
-            """
-            Intel IPEX not found. Falling back to PyTorch native 
-            all_reduce for CPU (e.g. MacOS)
-            """
-            pass
    def all_reduce(self, input_):
        self.dist_module.all_reduce(input_, group=self.device_group)
        return input_
+    def gather(self,
+               input_: torch.Tensor,
+               dst: int = 0,
+               dim: int = -1) -> Optional[torch.Tensor]:
+        """
+        Gather `input_` from every rank of the group onto rank `dst`.
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        :param input_: The local tensor contributed by this rank.
+        :param dst: Local (in-group) rank that receives the result.
+        :param dim: Dimension along which the gathered tensors are
+            concatenated; negative values count from the last dimension.
+        :return: On rank `dst`, the concatenation of all ranks' tensors along
+            `dim`; None on every other rank.
+        """
+        world_size = self.world_size
+        assert -input_.dim() <= dim < input_.dim(), (
+            f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        # Allocate output tensor.
+        # Only the destination rank needs receive buffers.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+        # Gather.
+        # `self.ranks[dst]` maps the local rank to the rank id passed to the
+        # backend (presumably the global rank -- TODO confirm).
+        self.dist_module.gather(input_,
+                                gather_list,
+                                dst=self.ranks[dst],
+                                group=self.device_group)
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        """
+        All-gather `input_` across the group and concatenate along `dim`.
+        :param input_: The local tensor contributed by this rank. The output
+            size computed below assumes every rank passes the same shape.
+        :param dim: Dimension along which the per-rank tensors are joined;
+            negative values count from the last dimension.
+        :return: A tensor shaped like `input_` except that dimension `dim` is
+            `world_size` times larger.
+        """
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # NOTE: we have to use concat-style all-gather here,
+        # stack-style all-gather has compatibility issues with
+        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+        output_size = (input_size[0] * self.world_size, ) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(output_size,
+                                    dtype=input_.dtype,
+                                    device=input_.device)
+        # All-gather.
+        self.dist_module.all_gather_into_tensor(output_tensor,
+                                                input_,
+                                                group=self.device_group)
+        # Reshape
+        # The flat result is viewed as (world_size, *input_size); the rank
+        # axis is then moved to `dim` and merged into that dimension.
+        output_tensor = output_tensor.reshape((self.world_size, ) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(input_size[:dim] +
+                                              (self.world_size *
+                                               input_size[dim], ) +
+                                              input_size[dim + 1:])
+        return output_tensor
+class _CPUSHMDistributed:
+    """
+    Stand-in for `torch.distributed` that forwards collectives to the
+    `torch.ops._C.shm_*` custom ops.
+    It provides the `all_reduce`, `gather` and `all_gather_into_tensor` entry
+    points that `CpuCommunicator` invokes on its `dist_module`.
+    """
+    def __init__(self, communicator: CpuCommunicator):
+        # Raises KeyError when VLLM_DIST_IDENT is missing from the environment.
+        instance_identifier = os.environ["VLLM_DIST_IDENT"]
+        self.communicator = communicator
+        # The group name joins the instance identifier with the member ranks,
+        # e.g. "<ident>-[0-1-2-3]-cpushm".
+        group_ranks = [str(rank) for rank in self.communicator.ranks]
+        shm_group_identifier = f"[{'-'.join(group_ranks)}]"
+        self.group_name = f"{instance_identifier}-{shm_group_identifier}-cpushm"
+        self.handle = self._init_cpu_shm()
+    def _init_cpu_shm(self) -> int:
+        """Initialize the shm manager for this group and return its handle."""
+        handle = torch.ops._C.init_shm_manager(
+            self.group_name,
+            self.communicator.world_size,
+            self.communicator.rank,
+        )
+        # NOTE(review): the barrier presumably guarantees that every rank has
+        # initialized its manager before any rank joins -- confirm against
+        # the C++ implementation.
+        torch.distributed.barrier(self.communicator.device_group)
+        torch.ops._C.join_shm_manager(
+            handle,
+            self.group_name,
+        )
+        torch.distributed.barrier(self.communicator.device_group)
+        return handle
+    def all_reduce(self,
+                   input: torch.Tensor,
+                   group: Optional[ProcessGroup] = None) -> None:
+        # `group` is accepted so the call site can pass it as it would to
+        # `torch.distributed.all_reduce`; it is not used here.
+        torch.ops._C.shm_allreduce(self.handle, input)
+    def gather(self,
+               input: torch.Tensor,
+               gather_list: Optional[List[torch.Tensor]],
+               dst: int = -1,
+               group: Optional[ProcessGroup] = None) -> None:
+        # Note: different from the torch gather, here we use local dst rank.
+        # The incoming `dst` is translated to the in-group rank before it is
+        # handed to the shm op.
+        torch.ops._C.shm_gather(self.handle, input, gather_list,
+                                torch.distributed.get_group_rank(group, dst))
+    def all_gather_into_tensor(self,
+                               output: torch.Tensor,
+                               input: torch.Tensor,
+                               group: Optional[ProcessGroup] = None) -> None:
+        # `group` is not used; note that the shm op takes (input, output),
+        # the reverse of this method's argument order.
+        torch.ops._C.shm_all_gather(self.handle, input, output)
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
 # SPDX-License-Identifier: Apache-2.0
-import ctypes
 from contextlib import contextmanager
 from typing import List, Optional, Union
@@ -10,7 +9,6 @@ from torch.distributed import ProcessGroup
 import vllm.envs as envs
 from vllm import _custom_ops as ops
-# from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
    gpu_p2p_access_check)
 from vllm.distributed.parallel_state import in_the_same_node_as
@@ -23,7 +21,7 @@ try:
    custom_ar = True
 except Exception:
-    # For AMD GPUs and CPUs
+    # For CPUs
    custom_ar = False
 logger = init_logger(__name__)
@@ -56,7 +54,7 @@ class CustomAllreduce:
    def __init__(self,
                 group: ProcessGroup,
                 device: Union[int, str, torch.device],
-                 max_size=8192 * 1024 * 2) -> None:
+                 max_size=8192 * 1024) -> None:
        """
        Args:
            group: the process group to work on. If None, it will use the
@@ -72,9 +70,9 @@ class CustomAllreduce:
        if not custom_ar:
            # disable because of missing custom allreduce library
-            # e.g. in a non-cuda environment
+            # e.g. in a non-GPU environment
-            logger.warning("Custom allreduce is disabled because "
+            logger.info("Custom allreduce is disabled because "
-                           "of missing custom allreduce library")
+                        "of missing custom allreduce library")
            return
        self.group = group
@@ -91,7 +89,6 @@ class CustomAllreduce:
        rank = dist.get_rank(group=self.group)
        self.rank = rank
-        self.rank = rank
        world_size = dist.get_world_size(group=self.group)
        if world_size == 1:
            # No need to initialize custom allreduce for single GPU case.
@@ -135,10 +132,11 @@ class CustomAllreduce:
        # this checks hardware and driver support for NVLink
        assert current_platform.is_cuda_alike()
-        full_nvlink = current_platform.is_fully_connected_nvlink_or_xgmi(
+        fully_connected = current_platform.is_fully_connected(
            physical_device_ids)
-        if not full_nvlink:
+        # if world_size > 2 and not fully_connected:
+        if not fully_connected:
            logger.warning(
                "Custom allreduce is disabled because it's not supported on"
                " more than two PCIe-only GPUs. To silence this warning, "
@@ -147,12 +145,13 @@ class CustomAllreduce:
        # test P2P capability, this checks software/cudaruntime support
        # this is expensive to compute at the first time
        # then we cache the result
-        # if not _can_p2p(rank, world_size):
+        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
-        #     logger.warning(
+        if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
-        #         "Custom allreduce is disabled because your platform lacks "
+            logger.warning(
-        #         "GPU P2P capability or P2P test failed. To silence this "
+                "Custom allreduce is disabled because your platform lacks "
-        #         "warning, specify disable_custom_all_reduce=True explicitly.")
+                "GPU P2P capability or P2P test failed. To silence this "
-        #     return
+                "warning, specify disable_custom_all_reduce=True explicitly.")
+            return
        self.disabled = False
        # Buffers memory are owned by this Python class and passed to C++.
@@ -175,46 +174,11 @@ class CustomAllreduce:
        self.max_size = max_size
        self.rank = rank
        self.world_size = world_size
-        self.full_nvlink = full_nvlink
+        self.fully_connected = fully_connected
        self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank,
-                                       self.full_nvlink)
+                                       self.fully_connected)
        ops.register_buffer(self._ptr, self.buffer_ptrs)
-    # @staticmethod
-    # def create_shared_buffer(
-    #         size_in_bytes: int,
-    #         group: Optional[ProcessGroup] = None) -> List[int]:
-    #     """
-    #     Creates a shared buffer and returns a list of pointers
-    #     representing the buffer on all processes in the group.
-    #     """
-    #     lib = CudaRTLibrary()
-    #     pointer = lib.cudaMalloc(size_in_bytes)
-    #     handle = lib.cudaIpcGetMemHandle(pointer)
-    #     world_size = dist.get_world_size(group=group)
-    #     rank = dist.get_rank(group=group)
-    #     handles = [None] * world_size
-    #     dist.all_gather_object(handles, handle, group=group)
-    #     pointers: List[int] = []
-    #     for i, h in enumerate(handles):
-    #         if i == rank:
-    #             pointers.append(pointer.value)  # type: ignore
-    #         else:
-    #             pointers.append(
-    #                 lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
-    #     return pointers
-    # @staticmethod
-    # def free_shared_buffer(pointers: List[int],
-    #                        group: Optional[ProcessGroup] = None,
-    #                        rank: Optional[int] = None) -> None:
-    #     if rank is None:
-    #         rank = dist.get_rank(group=group)
-    #     lib = CudaRTLibrary()
-    #     lib.cudaFree(ctypes.c_void_p(pointers[rank]))
    @contextmanager
    def capture(self):
        """
@@ -261,7 +225,7 @@ class CustomAllreduce:
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        if self.world_size == 2 or self.full_nvlink:
+        if self.world_size == 2 or self.fully_connected:
            return inp_size < self.max_size
        return False
@@ -312,8 +276,8 @@ class CustomAllreduce:
    def __del__(self):
        self.close()
    @staticmethod
    def create_shared_buffer(size_in_bytes: int,
                             group: Optional[ProcessGroup] = None,
@@ -340,4 +304,3 @@ class CustomAllreduce:
        if rank is None:
            rank = dist.get_rank(group=group)
        ops.free_shared_buffer(pointers[rank])
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -125,8 +125,13 @@ class ShmRingBuffer:
                       lambda *args, **kwargs: None):
                try:
                    self.shared_memory = shared_memory.SharedMemory(name=name)
-                    assert (
+                    # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa
-                        self.shared_memory.size == self.total_bytes_of_buffer)
+                    # Some platforms allocate memory based on page size,
+                    # so the shared memory block may be larger than or equal
+                    # to the requested size. The size parameter is ignored
+                    # when attaching to an existing block.
+                    assert (self.shared_memory.size
+                            >= self.total_bytes_of_buffer)
                except FileNotFoundError:
                    # we might deserialize the object in a different node
                    # in this case, this object is not used,

--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -22,6 +22,8 @@ if current_platform.is_tpu():
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr
    from torch_xla._internal import pjrt
+    from torch_xla.distributed.xla_multiprocessing import (
+        create_optimized_replica_groups)
    if USE_RAY:
        from vllm.executor import ray_utils
@@ -79,9 +81,12 @@ class TpuCommunicator(DeviceCommunicatorBase):
        pjrt.initialize_multiprocess(local_rank, local_world_size)
        xr._init_world_size_ordinal()
+        self.groups = create_optimized_replica_groups()
    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
-        return xm.all_reduce(xm.REDUCE_SUM, input_)
+        # TODO: Remove the groups specification after XLA compiler can support
+        # auto-reordering the ring order for all-reduce.
+        return xm.all_reduce(xm.REDUCE_SUM, input_, groups=self.groups)
    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
        assert dim == -1, "TPUs only support dim=-1 for all-gather."

--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -53,3 +53,8 @@ KVConnectorFactory.register_connector(
    "LMCacheConnector",
    "vllm.distributed.kv_transfer.kv_connector.lmcache_connector",
    "LMCacheConnector")
+KVConnectorFactory.register_connector(
+    "MooncakeStoreConnector",
+    "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector",
+    "MooncakeStoreConnector")
\ No newline at end of file
--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+MooncakeStore Connector for Distributed Machine Learning Inference
+The MooncakeStoreConnector transfers KV caches between prefill vLLM workers
+(KV cache producer) and decode vLLM workers (KV cache consumer) using a
+database-style KVStore.
+"""
+import hashlib
+from typing import TYPE_CHECKING, List, Tuple, Union
+import torch
+from vllm import _custom_ops as ops
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
+from vllm.logger import init_logger
+from vllm.sequence import IntermediateTensors
+if TYPE_CHECKING:
+    from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+logger = init_logger(__name__)
+class MooncakeStoreConnector(KVConnectorBase):
+    def __init__(
+        self,
+        rank: int,
+        local_rank: int,
+        config: VllmConfig,
+    ):
+        """Create the connector and open its backing MooncakeStore.
+        Args:
+            rank (int): Rank of this worker; not used by this connector.
+            local_rank (int): Local rank, stored as the tensor-parallel rank
+                suffix appended to every store key.
+            config (VllmConfig): vLLM config; `kv_transfer_config` and
+                `parallel_config.tensor_parallel_size` are read from it.
+        Raises:
+            ValueError: If `kv_connector` is not "MooncakeStoreConnector", or
+                if the MOONCAKE_CONFIG_PATH environment variable is not set.
+        """
+        self.config = config.kv_transfer_config
+        self.tp_size = config.parallel_config.tensor_parallel_size
+        self.local_tp_rank = local_rank
+        # Reject an unsupported connector name explicitly. Merely logging it
+        # would leave `self.kv_store` unassigned, so the first use would fail
+        # with an opaque AttributeError instead of a configuration error.
+        if self.config.kv_connector != "MooncakeStoreConnector":
+            raise ValueError(f"Can not find {self.config.kv_connector}")
+        # Check if MOONCAKE_CONFIG_PATH is set
+        import os
+        if os.getenv('MOONCAKE_CONFIG_PATH') is None:
+            raise ValueError(
+                "To use MooncakeStoreConnector, you need to pass the ENV: "
+                "'MOONCAKE_CONFIG_PATH=/path/to/mooncake_config.json'.")
+        # Imported lazily so the Mooncake dependency is only required when
+        # this connector is actually selected.
+        from vllm.distributed.kv_transfer.kv_lookup_buffer.mooncake_store import (  # noqa: E501
+            MooncakeStore)
+        logger.info(
+            "Initializing KVStoreConnector under kv_transfer_config %s",
+            self.config)
+        self.kv_store = MooncakeStore(config)
+    def close(self) -> None:
+        """Close the connector and release resources.
+        This method delegates to `self.kv_store.close()` to clean up the
+        underlying KV store when the connector is no longer needed.
+        """
+        self.kv_store.close()
+    def send_kv_caches_and_hidden_states(
+        self,
+        model_executable: torch.nn.Module,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        kv_caches: List[torch.Tensor],
+        hidden_or_intermediate_states: Union[torch.Tensor,
+                                             IntermediateTensors],
+    ) -> None:
+        """Store the KV cache and hidden states of every sequence in the batch.
+        For each sequence, the key/value rows selected by its slot mapping are
+        collected from every local layer and stacked into one tensor laid out
+        as (2, num_local_layers, num_tokens, num_heads, head_size). It is
+        stored under "<token hash>_<tp rank>"; the matching slice of the
+        hidden states is stored under "<token hash>_hidden_<tp rank>".
+        Args:
+            model_executable (torch.nn.Module): Model whose `model` attribute
+                provides `start_layer`, `end_layer` and `config`.
+            model_input: Batch input providing `input_tokens` and
+                `attn_metadata` (`seq_lens`, `slot_mapping`).
+            kv_caches (List[torch.Tensor]): Per-layer caches for the local
+                layers; element 0 holds keys and element 1 holds values.
+            hidden_or_intermediate_states: Model output, sliced per sequence.
+        """
+        input_tokens_tensor = model_input.input_tokens
+        seq_lens = model_input.attn_metadata.seq_lens
+        slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten()
+        start_layer = model_executable.model.start_layer
+        end_layer = model_executable.model.end_layer
+        model_config = model_executable.model.config
+        # Integer division keeps these sizes exact.
+        num_heads = model_config.num_key_value_heads // self.tp_size
+        head_size = (model_config.hidden_size //
+                     model_config.num_attention_heads)
+        # Running token offset; avoids re-summing `seq_lens` per sequence.
+        next_start = 0
+        for slen in seq_lens:
+            start_pos = next_start
+            end_pos = start_pos + slen
+            next_start = end_pos
+            current_tokens = input_tokens_tensor[start_pos:end_pos]
+            # The slot mapping of a sequence is the same for every layer.
+            current_slot_mapping = slot_mapping_flat[start_pos:end_pos]
+            store_key_prefix = self.tensor_hash(current_tokens)
+            keys, values = [], []
+            for layer_id in range(start_layer, end_layer):
+                # `kv_caches` only holds the local layers.
+                kv_cache = kv_caches[layer_id - start_layer]
+                key_cache = kv_cache[0].reshape(-1, num_heads, head_size)
+                value_cache = kv_cache[1].reshape(-1, num_heads, head_size)
+                keys.append(key_cache[current_slot_mapping].unsqueeze(0))
+                values.append(value_cache[current_slot_mapping].unsqueeze(0))
+            keys = torch.cat(keys, dim=0)
+            values = torch.cat(values, dim=0)
+            kvcache_to_sent = torch.stack((keys, values), dim=0)
+            store_kvcache_key = f"{store_key_prefix}_{self.local_tp_rank}"
+            self.kv_store.put(store_kvcache_key, kvcache_to_sent)
+            hidden_key = f"{store_key_prefix}_hidden_{self.local_tp_rank}"
+            self.kv_store.put(hidden_key,
+                              hidden_or_intermediate_states[start_pos:end_pos])
+        logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank())
+    def recv_kv_caches_and_hidden_states(
+        self, model_executable: torch.nn.Module,
+        model_input: "ModelInputForGPUWithSamplingMetadata",
+        kv_caches: List[torch.Tensor]
+    ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool,
+               "ModelInputForGPUWithSamplingMetadata"]:
+        """Load stored KV caches and hidden states for the batch, if present.
+        For every prefill sequence the store is queried with the keys used
+        by `send_kv_caches_and_hidden_states`. On a hit, the stored keys and
+        values are written into the local `kv_caches` layer by layer.
+        Args:
+            model_executable (torch.nn.Module): Model whose `model` attribute
+                provides `start_layer`, `end_layer` and `layers`.
+            model_input: Batch input providing `input_tokens` and
+                `attn_metadata` (`seq_lens`, `num_prefill_tokens`,
+                `slot_mapping`).
+            kv_caches (List[torch.Tensor]): Per-layer caches for the local
+                layers; element 0 holds keys and element 1 holds values.
+        Returns:
+            A tuple `(hidden_states, bypass_model_exec, model_input)`.
+            `bypass_model_exec` is True only when every sequence was found;
+            in that case `hidden_states` is the concatenation of the stored
+            hidden states, otherwise it is None. `model_input` is returned
+            unchanged.
+        """
+        bypass_model_exec = True
+        input_tokens_tensor = model_input.input_tokens
+        seq_lens = model_input.attn_metadata.seq_lens
+        num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens
+        slot_mapping = model_input.attn_metadata.slot_mapping.flatten()
+        start_layer = model_executable.model.start_layer
+        end_layer = model_executable.model.end_layer
+        hidden_or_intermediate_states_for_one_req = []
+        # Running token offset; avoids re-summing `seq_lens` per sequence.
+        next_start = 0
+        for slen in seq_lens:
+            # Advance the offset up front so the `continue` on a cache miss
+            # below cannot skip it.
+            start_pos = next_start
+            end_pos = start_pos + slen
+            next_start = end_pos
+            if start_pos >= num_prefill_tokens:
+                # This can happen during inflight batching. See:
+                # vllm/worker/model_runner.py::_prepare_model_input_tensors:
+                # - input_tokens[:num_prefill_tokens] contains prefill tokens.
+                # - input_tokens[num_prefill_tokens:] contains decode tokens.
+                logger.warning("You should set --enable_chunked_prefill=False "
+                               "and --max_num_batched_tokens "
+                               "should be equal to max_seq_len_to_capture")
+                bypass_model_exec = False
+                assert start_pos == num_prefill_tokens
+                break
+            current_tokens = input_tokens_tensor[start_pos:end_pos]
+            # Rebuild the store keys exactly as the sender derived them.
+            load_key_prefix = self.tensor_hash(current_tokens)
+            load_kvcache_key = f"{load_key_prefix}_{self.local_tp_rank}"
+            remote_kv = self.kv_store.get(load_kvcache_key)
+            hidden_key = f"{load_key_prefix}_hidden_{self.local_tp_rank}"
+            hidden = self.kv_store.get(hidden_key)
+            if remote_kv is None or hidden is None:
+                # didn't find any match.
+                bypass_model_exec = False
+                continue
+            num_computed_tokens = current_tokens.shape[0]
+            # update the end position based on how many tokens are cached.
+            end_pos = start_pos + num_computed_tokens
+            # call self.kv_store to get kv layer by layer
+            for layer_id in range(start_layer, end_layer):
+                layer = model_executable.model.layers[layer_id]
+                # The sender stacks only its local layers, starting at index
+                # 0 for `start_layer`. Both the local cache list and the
+                # stored tensor are therefore indexed relative to it.
+                local_layer_idx = layer_id - start_layer
+                # get kvcache object
+                kv_cache = kv_caches[local_layer_idx]
+                key_cache, value_cache = kv_cache[0], kv_cache[1]
+                # get remote kvcache
+                remote_k = remote_kv[0][local_layer_idx]
+                remote_v = remote_kv[1][local_layer_idx]
+                # use ops.reshape_and_cache_flash to put kv into kvcache
+                ops.reshape_and_cache_flash(
+                    remote_k.to(key_cache.device),
+                    remote_v.to(value_cache.device),
+                    key_cache,
+                    value_cache,
+                    slot_mapping[start_pos:end_pos],
+                    layer.self_attn.attn.kv_cache_dtype,
+                    layer.self_attn.attn._k_scale,
+                    layer.self_attn.attn._v_scale,
+                )
+            hidden_or_intermediate_states_for_one_req.append(hidden)
+        if not bypass_model_exec:
+            logger.warning(
+                "[rank%d]: Failed to receive all KVs and hidden "
+                "states, redo model forwarding.", torch.distributed.get_rank())
+            hidden_or_intermediate_states = None
+        else:
+            logger.debug(
+                "[rank%d]: Successfully received all KVs and hidden "
+                "states, skip model forwarding.", torch.distributed.get_rank())
+            hidden_or_intermediate_states = torch.cat(
+                hidden_or_intermediate_states_for_one_req, dim=0)
+        return hidden_or_intermediate_states, bypass_model_exec, model_input
+    @staticmethod
+    def tensor_hash(tensor: torch.Tensor) -> int:
+        """Calculate the hash value of the tensor.
+        The tensor's raw bytes are hashed with BLAKE2b and the first 16 hex
+        digits of the digest are returned as an integer, so the result fits
+        in 64 bits.
+        Args:
+            tensor (torch.Tensor): Tensor to hash; it is copied to host
+                memory if it lives on another device.
+        Returns:
+            int: Non-negative integer derived from the digest.
+        """
+        # `tobytes()` already yields an independent copy of the data, so no
+        # `clone()` is needed before moving the tensor to the host.
+        tensor_bytes = tensor.detach().cpu().numpy().tobytes()
+        hash_hex = hashlib.blake2b(tensor_bytes).hexdigest()
+        return int(hash_hex[:16], 16)
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
 # SPDX-License-Identifier: Apache-2.0
 """
-This file contains a new class `KVLookupBufferBase` that allows developers to 
+This file contains a new class `KVLookupBufferBase` that allows developers to
-think of KV cache operations as inserting new KV cache entries (`insert`) 
+think of KV cache operations as inserting new KV cache entries (`insert`)
-into the lookup buffer and querying existing KV caches (`drop_select`) 
+into the lookup buffer and querying existing KV caches (`drop_select`)
 from the lookup buffer.
-All distributed communications are abstracted behind this class.
+This file also contains a new class `KVStoreBufferBase` that allows developers
+to manage the KVCache buffer as a simple key-value storage buffer with basic
+put/get operations.
+Both classes derive from the common abstract base class `KVCacheBufferBase`.
 """
 from abc import ABC, abstractmethod
@@ -14,9 +18,27 @@ from typing import List, Optional
 import torch
-class KVLookupBufferBase(ABC):
+class KVCacheBufferBase(ABC):
+    """
+    Abstract base class for a KVCache buffer.
    """
-    Abstract base class for a lookup buffer.
+    @abstractmethod
+    def close(self) -> None:
+        """Close the buffer and release resources.
+        This method is responsible for cleaning up resources related to the
+        KVCache buffer when it is no longer needed.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
+class KVLookupBufferBase(KVCacheBufferBase):
+    """
+    Abstract base class for a KVCache lookup buffer.
    This class provides an abstraction for a key-value (KV) cache lookup buffer.
@@ -96,12 +118,55 @@ class KVLookupBufferBase(ABC):
        """
        raise NotImplementedError
+class KVStoreBufferBase(KVCacheBufferBase):
+    """
+    Abstract base class for a KVCache storage buffer with key-value semantics.
+    This class provides a simple key-value storage buffer abstract with basic
+    put/get operations, which enables flexible KVCache transfer granular
+    control.
+    The functionality is similar to a distributed key-value store, where:
+    - Key: A unique string identifier for the cached entry
+    - Value:
+        - Tensor to be stored and retrieved
+        - None (indicating deletion or empty value)
+    """
+    @abstractmethod
+    def put(
+        self,
+        key: str,
+        value: Optional[torch.Tensor],
+    ) -> None:
+        """Store a key-value pair in the buffer.
+        Args:
+            key (str): Unique identifier for a tensor, this tensor could be the
+                key cache tensor, value cache tensor, or hidden state tensor
+                generated during model forwarding.
+            value (Optional[torch.Tensor]): Tensor to be stored.
+        Raises:
+            NotImplementedError: This method must be implemented in subclasses.
+        """
+        raise NotImplementedError
    @abstractmethod
-    def close(self) -> None:
+    def get(
-        """Close the buffer and release resources.
+        self,
+        key: str,
+    ) -> Optional[torch.Tensor]:
+        """Retrieve a value from the buffer by key.
+        Args:
+            key (str): Unique identifier for a tensor, this tensor could be the
+                key cache tensor, value cache tensor, or hidden state tensor
+                generated during model forwarding.
-        This method is responsible for cleaning up resources related to the 
+        Returns:
-        lookup buffer when it is no longer needed.
+            Optional[torch.Tensor]: Stored tensor if exists, None otherwise.
        Raises:
            NotImplementedError: This method must be implemented in subclasses.

--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This file contains a new class `MooncakeStore` that allows developers to
+think of KV cache transfer operations as putting new KV cache entries
+into a remote KVStore-based lookup buffer and getting existing KV caches
+from this remote lookup buffer.
+"""
+import json
+import os
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from safetensors.torch import load as safetensors_load
+from safetensors.torch import save as safetensors_save
+from vllm.config import VllmConfig
+from vllm.distributed.kv_transfer.kv_lookup_buffer.base import (
+    KVStoreBufferBase)
+from vllm.logger import init_logger
+# Fallback sizes (in bytes) used by MooncakeStoreConfig.from_file when the
+# JSON config omits "global_segment_size" / "local_buffer_size".
+DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200  # 3.125 GiB
+DEFAULT_LOCAL_BUFFER_SIZE = 1073741824  # 1.0 GiB
+# Module-level logger shared by the classes in this file.
+logger = init_logger(__name__)
+@dataclass
+class MooncakeStoreConfig:
+    """Connection settings for a Mooncake distributed store.
+    Instances are normally built from a JSON file via `from_file` or
+    `load_from_env`.
+    """
+    local_hostname: str
+    metadata_server: str
+    global_segment_size: int
+    local_buffer_size: int
+    protocol: str
+    device_name: str
+    master_server_address: str
+    @staticmethod
+    def from_file(file_path: str) -> 'MooncakeStoreConfig':
+        """Load the config from a JSON file.
+        "local_hostname", "metadata_server" and "master_server_address" have
+        no default and must be present; the remaining keys fall back to
+        defaults ("tcp" protocol, empty device name, default buffer sizes).
+        Args:
+            file_path: Path to the JSON configuration file.
+        Raises:
+            ValueError: If one of the required keys is missing or null.
+        """
+        with open(file_path, encoding="utf-8") as fin:
+            config = json.load(fin)
+        # Fail early with a clear message instead of handing None values to
+        # the store setup call.
+        required_keys = ("local_hostname", "metadata_server",
+                         "master_server_address")
+        missing = [key for key in required_keys if config.get(key) is None]
+        if missing:
+            missing_keys = ", ".join(missing)
+            raise ValueError(
+                f"Mooncake config file {file_path!r} is missing required "
+                f"key(s): {missing_keys}")
+        return MooncakeStoreConfig(
+            local_hostname=config.get("local_hostname"),
+            metadata_server=config.get("metadata_server"),
+            global_segment_size=config.get("global_segment_size",
+                                           DEFAULT_GLOBAL_SEGMENT_SIZE),
+            local_buffer_size=config.get("local_buffer_size",
+                                         DEFAULT_LOCAL_BUFFER_SIZE),
+            protocol=config.get("protocol", "tcp"),
+            device_name=config.get("device_name", ""),
+            master_server_address=config.get("master_server_address"),
+        )
+    @staticmethod
+    def load_from_env() -> 'MooncakeStoreConfig':
+        """Load config from a file specified in the environment variable.
+        Raises:
+            ValueError: If 'MOONCAKE_CONFIG_PATH' is not set, or if the file
+                it names lacks a required key.
+        """
+        config_file_path = os.getenv('MOONCAKE_CONFIG_PATH')
+        if config_file_path is None:
+            raise ValueError(
+                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.")
+        return MooncakeStoreConfig.from_file(config_file_path)
+class MooncakeStore(KVStoreBufferBase):
+    """KVStoreBufferBase backed by a Mooncake distributed store.
+    Tensors are serialized with safetensors together with the CUDA device
+    index they lived on (-1 for non-CUDA tensors), so that `get` can move
+    them back onto that device.
+    """
+    def __init__(
+        self,
+        config: VllmConfig,
+    ):
+        """Create the Mooncake store client and set it up.
+        The connection settings are read from the JSON file named by the
+        'MOONCAKE_CONFIG_PATH' environment variable; `config` is not read
+        in this constructor.
+        Raises:
+            ImportError: If `mooncake_vllm_adaptor` cannot be imported.
+            ValueError: If the configuration cannot be loaded.
+        """
+        try:
+            from mooncake_vllm_adaptor import MooncakeDistributedStore
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
+                "to run vLLM with MooncakeConnector.") from e
+        try:
+            self.store = MooncakeDistributedStore()
+            self.config = MooncakeStoreConfig.load_from_env()
+            logger.info("Mooncake Configuration loaded successfully.")
+            self.store.setup(self.config.local_hostname,
+                             self.config.metadata_server,
+                             self.config.global_segment_size,
+                             self.config.local_buffer_size,
+                             self.config.protocol, self.config.device_name,
+                             self.config.master_server_address)
+        except ValueError as e:
+            logger.error("Configuration loading failed: %s", e)
+            raise
+        except Exception as exc:
+            # Any other failure (including store setup) is logged and
+            # propagated unchanged.
+            logger.error(
+                "An error occurred while loading the configuration: %s", exc)
+            raise
+    def close(self):
+        """No-op; see the comment below."""
+        # MooncakeDistributedStore will automatically call the destructor, so
+        # it is unnecessary to close it manually.
+        pass
+    def put(
+        self,
+        key: str,
+        value: Optional[torch.Tensor],
+    ) -> None:
+        """Store `value` under `key`; a `None` value is silently ignored."""
+        # A message queue needs to be introduced before making it asynchronous.
+        if value is not None:
+            self._put_impl(key, value)
+    def get(
+        self,
+        key: str,
+    ) -> Optional[torch.Tensor]:
+        """Return the tensor stored under `key`, or None if none was found."""
+        # A message queue needs to be introduced before making it asynchronous.
+        return self._get_impl(key)
+    def _put_impl(
+        self,
+        key: str,
+        value: torch.Tensor,
+    ) -> None:
+        """Put KVCache to Mooncake Store
+        Raises:
+            TypeError: If the underlying store rejects the arguments.
+        """
+        # Remember the CUDA device index so `_get_impl` can restore placement;
+        # -1 marks a tensor that was not on a CUDA device.
+        device_id = value.device.index if value.device.type == 'cuda' else -1
+        device_tensor = torch.tensor(device_id, dtype=torch.int32)
+        # safetensors refuses to serialize non-contiguous tensors, so strided
+        # views are compacted first (a no-op for already contiguous tensors).
+        value_bytes = safetensors_save({
+            "tensor": value.contiguous(),
+            "device_id": device_tensor
+        })
+        try:
+            self.store.put(key, value_bytes)
+        except TypeError as err:
+            logger.error("Failed to put value into Mooncake Store: %s", err)
+            raise TypeError("Mooncake Store Put Type Error.") from err
+    def _get_impl(
+        self,
+        key: str,
+    ) -> Optional[torch.Tensor]:
+        """Get KVCache from Mooncake Store
+        Returns:
+            The deserialized tensor moved to the device recorded at put time,
+            or None when the store returns no data for `key`.
+        Raises:
+            TypeError: If the underlying store rejects the arguments.
+        """
+        try:
+            data = self.store.get(key)
+        except TypeError as err:
+            logger.error("Failed to get value from Mooncake Store: %s", err)
+            raise TypeError("Mooncake Store Get Type Error.") from err
+        if not data:
+            return None
+        loaded_tensors = safetensors_load(data)
+        tensor = loaded_tensors["tensor"]
+        device_id = int(loaded_tensors["device_id"].item())
+        # NOTE(review): assumes the recorded CUDA index also exists in the
+        # process calling get() - confirm for cross-node transfers.
+        device = torch.device(
+            'cuda', device_id) if device_id >= 0 else torch.device('cpu')
+        return tensor.to(device)
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -119,11 +119,13 @@ def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor:
 if supports_custom_op():
+    from vllm.platforms import current_platform
    direct_register_custom_op(
        op_name="all_reduce",
        op_func=all_reduce,
        mutates_args=[],
        fake_impl=all_reduce_fake,
+        dispatch_key=current_platform.dispatch_key,
    )
@@ -219,7 +221,8 @@ class GroupCoordinator:
                self.cpu_group, 1 << 22, 6)
        from vllm.platforms import current_platform
-        self.use_custom_op_call = current_platform.is_cuda_alike()
+        self.use_custom_op_call = (current_platform.is_cuda_alike()
+                                   or current_platform.is_tpu())
    @property
    def first_rank(self):

--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -15,6 +15,7 @@ import torch
 from torch.distributed import ProcessGroup, TCPStore
 from torch.distributed.distributed_c10d import (Backend, PrefixStore,
                                                _get_default_timeout,
+                                                _unregister_process_group,
                                                is_nccl_available)
 from torch.distributed.rendezvous import rendezvous
@@ -206,10 +207,7 @@ class StatelessProcessGroup:
    def barrier(self):
        """A barrier to synchronize all ranks."""
        for i in range(self.world_size):
-            if i == self.rank:
+            self.broadcast_obj(None, src=i)
-                self.broadcast_obj(None, src=self.rank)
-            else:
-                self.broadcast_obj(None, src=i)
    @staticmethod
    def create(
@@ -333,3 +331,15 @@ def stateless_init_torch_distributed_process_group(
    pg._register_backend(device, backend_type, backend_class)
    return pg
+def stateless_destroy_torch_distributed_process_group(
+        pg: ProcessGroup) -> None:
+    """
+    Destroy ProcessGroup returned by
+        stateless_init_torch_distributed_process_group().
+    Args:
+        pg: The process group to tear down. Its backend is shut down first,
+            then the group is unregistered under its ``group_name``.
+    """
+    # Lazy import for non-CUDA backends.
+    from torch.distributed.distributed_c10d import _shutdown_backend
+    # Both helpers are private (underscore-prefixed) torch.distributed APIs.
+    _shutdown_backend(pg)
+    _unregister_process_group(pg.group_name)
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -23,6 +23,7 @@ from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.plugins import load_general_plugins
+from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
@@ -114,10 +115,12 @@ class EngineArgs:
    # number of P/D disaggregation (or other disaggregation) workers
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
+    data_parallel_size: int = 1
    enable_expert_parallel: bool = False
    max_parallel_loading_workers: Optional[int] = None
    block_size: Optional[int] = None
    enable_prefix_caching: Optional[bool] = None
+    prefix_caching_hash_algo: str = "builtin"
    disable_sliding_window: bool = False
    disable_cascade_attn: bool = False
    use_v2_block_manager: bool = True
@@ -180,23 +183,7 @@ class EngineArgs:
    guided_decoding_backend: str = 'xgrammar'
    logits_processor_pattern: Optional[str] = None
-    speculative_config: Optional[Union[str, Dict[str, Any]]] = None
+    speculative_config: Optional[Dict[str, Any]] = None
-    # TODO(Shangming): Deprecate these out-of-date params after next release
-    speculative_model: Optional[str] = None
-    speculative_model_quantization: Optional[str] = None
-    speculative_draft_tensor_parallel_size: Optional[int] = None
-    num_speculative_tokens: Optional[int] = None
-    num_speculative_heads: Optional[int] = None
-    speculative_disable_mqa_scorer: Optional[bool] = False
-    speculative_max_model_len: Optional[int] = None
-    speculative_disable_by_batch_size: Optional[int] = None
-    ngram_prompt_lookup_max: Optional[int] = None
-    ngram_prompt_lookup_min: Optional[int] = None
-    spec_decoding_acceptance_method: str = 'rejection_sampler'
-    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
-    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
-    disable_logprobs_during_spec_decoding: Optional[bool] = None
    qlora_adapter_name_or_path: Optional[str] = None
    show_hidden_metrics_for_version: Optional[str] = None
@@ -323,9 +310,7 @@ class EngineArgs:
        parser.add_argument('--download-dir',
                            type=nullable_str,
                            default=EngineArgs.download_dir,
-                            help='Directory to download and load the weights, '
+                            help='Directory to download and load the weights.')
-                            'default to the default cache dir of '
-                            'huggingface.')
        parser.add_argument(
            '--load-format',
            type=str,
@@ -400,8 +385,7 @@ class EngineArgs:
            'Valid backend values are "xgrammar", "guidance", and "auto". '
            'With "auto", we will make opinionated choices based on request'
            'contents and what the backend libraries currently support, so '
-            'the behavior is subject to change in each release. '
+            'the behavior is subject to change in each release.')
-            'The default is xgrammar.')
        parser.add_argument(
            '--logits-processor-pattern',
            type=nullable_str,
@@ -445,6 +429,14 @@ class EngineArgs:
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='Number of tensor parallel replicas.')
+        parser.add_argument('--data-parallel-size',
+                            '-dp',
+                            type=int,
+                            default=EngineArgs.data_parallel_size,
+                            help='Number of data parallel replicas. '
+                            'MoE layers will be sharded according to the '
+                            'product of the tensor-parallel-size and '
+                            'data-parallel-size.')
        parser.add_argument(
            '--enable-expert-parallel',
            action='store_true',
@@ -479,6 +471,15 @@ class EngineArgs:
            help="Enables automatic prefix caching. "
            "Use ``--no-enable-prefix-caching`` to disable explicitly.",
        )
+        parser.add_argument(
+            "--prefix-caching-hash-algo",
+            type=str,
+            choices=["builtin", "sha256"],
+            default=EngineArgs.prefix_caching_hash_algo,
+            help="Set the hash algorithm for prefix caching. "
+            "Options are 'builtin' (Python's built-in hash) or 'sha256' "
+            "(collision resistant but with certain overheads).",
+        )
        parser.add_argument('--disable-sliding-window',
                            action='store_true',
                            help='Disables sliding window, '
@@ -551,9 +552,7 @@ class EngineArgs:
            type=int,
            default=EngineArgs.max_num_partial_prefills,
            help="For chunked prefill, the max number of concurrent \
-            partial prefills."
+            partial prefills.")
-            "Defaults to 1",
-        )
        parser.add_argument(
            "--max-long-partial-prefills",
            type=int,
@@ -562,15 +561,13 @@ class EngineArgs:
            "than --long-prefill-token-threshold that will be prefilled "
            "concurrently. Setting this less than --max-num-partial-prefills "
            "will allow shorter prompts to jump the queue in front of longer "
-            "prompts in some cases, improving latency. Defaults to 1.")
+            "prompts in some cases, improving latency.")
        parser.add_argument(
            "--long-prefill-token-threshold",
            type=float,
            default=EngineArgs.long_prefill_token_threshold,
            help="For chunked prefill, a request is considered long if the "
-            "prompt is longer than this number of tokens. Defaults to 4%% of "
+            "prompt is longer than this number of tokens.")
-            "the model's context length.",
-        )
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
@@ -658,7 +655,7 @@ class EngineArgs:
            type=nullable_kvs,
            default=EngineArgs.limit_mm_per_prompt,
            # The default value is given in
-            # MultiModalRegistry.init_mm_limits_per_prompt
+            # MultiModalConfig.get_limit_per_prompt
            help=('For each multimodal plugin, limit how many '
                  'input instances to allow for each prompt. '
                  'Expects a comma-separated list of items, '
@@ -730,8 +727,7 @@ class EngineArgs:
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
-                  'Must be >= than max_loras. '
+                  'Must be >= than max_loras.'))
-                  'Defaults to max_loras.'))
        parser.add_argument(
            '--fully-sharded-loras',
            action='store_true',
@@ -793,129 +789,10 @@ class EngineArgs:
            help='If set, the prefill requests can be chunked based on the '
            'max_num_batched_tokens.')
        parser.add_argument('--speculative-config',
-                            type=nullable_str,
+                            type=json.loads,
                            default=None,
                            help='The configurations for speculative decoding.'
                            ' Should be a JSON string.')
-        parser.add_argument(
-            '--speculative-model',
-            type=nullable_str,
-            default=EngineArgs.speculative_model,
-            help=
-            'The name of the draft model to be used in speculative decoding.')
-        # Quantization settings for speculative model.
-        parser.add_argument(
-            '--speculative-model-quantization',
-            type=nullable_str,
-            choices=[*QUANTIZATION_METHODS, None],
-            default=EngineArgs.speculative_model_quantization,
-            help='Method used to quantize the weights of speculative model. '
-            'If None, we first check the `quantization_config` '
-            'attribute in the model config file. If that is '
-            'None, we assume the model weights are not '
-            'quantized and use `dtype` to determine the data '
-            'type of the weights.')
-        parser.add_argument(
-            '--num-speculative-tokens',
-            type=int,
-            default=EngineArgs.num_speculative_tokens,
-            help='The number of speculative tokens to sample from '
-            'the draft model in speculative decoding.')
-        parser.add_argument(
-            '--num-speculative-heads',
-            type=int,
-            default=EngineArgs.num_speculative_heads,
-            help='The number of speculative heads to sample from '
-                 'the draft model in speculative decoding.')
-        parser.add_argument(
-            '--speculative-disable-mqa-scorer',
-            action='store_true',
-            help=
-            'If set to True, the MQA scorer will be disabled in speculative '
-            ' and fall back to batch expansion')
-        parser.add_argument(
-            '--speculative-draft-tensor-parallel-size',
-            '-spec-draft-tp',
-            type=int,
-            default=EngineArgs.speculative_draft_tensor_parallel_size,
-            help='Number of tensor parallel replicas for '
-            'the draft model in speculative decoding.')
-        parser.add_argument(
-            '--speculative-max-model-len',
-            type=int,
-            default=EngineArgs.speculative_max_model_len,
-            help='The maximum sequence length supported by the '
-            'draft model. Sequences over this length will skip '
-            'speculation.')
-        parser.add_argument(
-            '--speculative-disable-by-batch-size',
-            type=int,
-            default=EngineArgs.speculative_disable_by_batch_size,
-            help='Disable speculative decoding for new incoming requests '
-            'if the number of enqueue requests is larger than this value.')
-        parser.add_argument(
-            '--ngram-prompt-lookup-max',
-            type=int,
-            default=EngineArgs.ngram_prompt_lookup_max,
-            help='Max size of window for ngram prompt lookup in speculative '
-            'decoding.')
-        parser.add_argument(
-            '--ngram-prompt-lookup-min',
-            type=int,
-            default=EngineArgs.ngram_prompt_lookup_min,
-            help='Min size of window for ngram prompt lookup in speculative '
-            'decoding.')
-        parser.add_argument(
-            '--spec-decoding-acceptance-method',
-            type=str,
-            default=EngineArgs.spec_decoding_acceptance_method,
-            choices=['rejection_sampler', 'typical_acceptance_sampler'],
-            help='Specify the acceptance method to use during draft token '
-            'verification in speculative decoding. Two types of acceptance '
-            'routines are supported: '
-            '1) RejectionSampler which does not allow changing the '
-            'acceptance rate of draft tokens, '
-            '2) TypicalAcceptanceSampler which is configurable, allowing for '
-            'a higher acceptance rate at the cost of lower quality, '
-            'and vice versa.')
-        parser.add_argument(
-            '--typical-acceptance-sampler-posterior-threshold',
-            type=float,
-            default=EngineArgs.typical_acceptance_sampler_posterior_threshold,
-            help='Set the lower bound threshold for the posterior '
-            'probability of a token to be accepted. This threshold is '
-            'used by the TypicalAcceptanceSampler to make sampling decisions '
-            'during speculative decoding. Defaults to 0.09')
-        parser.add_argument(
-            '--typical-acceptance-sampler-posterior-alpha',
-            type=float,
-            default=EngineArgs.typical_acceptance_sampler_posterior_alpha,
-            help='A scaling factor for the entropy-based threshold for token '
-            'acceptance in the TypicalAcceptanceSampler. Typically defaults '
-            'to sqrt of --typical-acceptance-sampler-posterior-threshold '
-            'i.e. 0.3')
-        parser.add_argument(
-            '--disable-logprobs-during-spec-decoding',
-            action=StoreBoolean,
-            default=EngineArgs.disable_logprobs_during_spec_decoding,
-            nargs="?",
-            const="True",
-            help='If set to True, token log probabilities are not returned '
-            'during speculative decoding. If set to False, log probabilities '
-            'are returned according to the settings in SamplingParams. If '
-            'not specified, it defaults to True. Disabling log probabilities '
-            'during speculative decoding reduces latency by skipping logprob '
-            'calculation in proposal sampling, target sampling, and after '
-            'accepted tokens are determined.')
        parser.add_argument('--model-loader-extra-config',
                            type=nullable_str,
                            default=EngineArgs.model_loader_extra_config,
@@ -1117,7 +994,7 @@ class EngineArgs:
        parser.add_argument(
            "--reasoning-parser",
            type=str,
-            choices=["deepseek_r1"],
+            choices=list(ReasoningParserManager.reasoning_parsers),
            default=None,
            help=
            "Select the reasoning parser depending on the model that you're "
@@ -1228,58 +1105,14 @@ class EngineArgs:
        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
-        dictionary from the engine. If `speculative_config` is not set, this
+        dictionary from the engine.
-        function will attempt to construct a configuration dictionary using
-        certain parameters, which are scheduled for deprecation in the next
-        release. Note that in next releases, `speculative_config` must be
-        provided, and the deprecated standalone speculative-related parameters
-        will be removed.
        """
        if self.speculative_config is None:
-            if (self.speculative_model is None
+            return None
-                    and self.num_speculative_tokens is None):
-                return None
-            # TODO(Shangming): Deprecate this way of setting SpeculativeConfig,
-            # only allow '--speculative-config' after next release
-            logger.warning_once(
-                "Please use '--speculative-config' to set all configurations "
-                "related to speculative decoding. The current method of "
-                "specifying the model through '--speculative-model' and "
-                "adding related parameters (e.g., '--num-speculative-tokens') "
-                "separately will be deprecated in the next release.")
-            spec_config_dict = {
-                "model": self.speculative_model,
-                "quantization": self.speculative_model_quantization,
-                "max_model_len": self.speculative_max_model_len,
-                "draft_tensor_parallel_size":
-                self.speculative_draft_tensor_parallel_size,
-                "num_speculative_tokens": self.num_speculative_tokens,
-                "disable_mqa_scorer": self.speculative_disable_mqa_scorer,
-                "disable_by_batch_size":
-                self.speculative_disable_by_batch_size,
-                "prompt_lookup_max": self.ngram_prompt_lookup_max,
-                "prompt_lookup_min": self.ngram_prompt_lookup_min,
-                "acceptance_method": self.spec_decoding_acceptance_method,
-                "posterior_threshold":
-                self.typical_acceptance_sampler_posterior_threshold,
-                "posterior_alpha":
-                self.typical_acceptance_sampler_posterior_alpha,
-                "disable_logprobs": self.disable_logprobs_during_spec_decoding,
-            }
-            self.speculative_config = spec_config_dict
-        else:
-            if isinstance(self.speculative_config, str):
-                import ast
-                self.speculative_config = ast.literal_eval(
-                    self.speculative_config)
        # Note(Shangming): These parameters are not obtained from the cli arg
        # '--speculative-config' and must be passed in when creating the engine
        # config.
-        assert isinstance(self.speculative_config, dict)
        self.speculative_config.update({
            "target_model_config": target_model_config,
            "target_parallel_config": target_parallel_config,
@@ -1349,6 +1182,7 @@ class EngineArgs:
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=model_config.get_sliding_window(),
            enable_prefix_caching=self.enable_prefix_caching,
+            prefix_caching_hash_algo=self.prefix_caching_hash_algo,
            cpu_offload_gb=self.cpu_offload_gb,
            calculate_kv_scales=self.calculate_kv_scales,
        )
@@ -1367,6 +1201,7 @@ class EngineArgs:
        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
+            data_parallel_size=self.data_parallel_size,
            enable_expert_parallel=self.enable_expert_parallel,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
@@ -1561,7 +1396,8 @@ class EngineArgs:
        # Xgrammar and Guidance are supported.
        SUPPORTED_GUIDED_DECODING = [
-            "xgrammar", "xgrammar:disable-any-whitespace", "guidance", "auto"
+            "xgrammar", "xgrammar:disable-any-whitespace", "guidance",
+            "guidance:disable-any-whitespace", "auto"
        ]
        if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
            _raise_or_fallback(feature_name="--guided-decoding-backend",
@@ -1603,12 +1439,6 @@ class EngineArgs:
                               recommend_to_remove=False)
            return False
-        # No CPU offloading yet.
-        if self.cpu_offload_gb != EngineArgs.cpu_offload_gb:
-            _raise_or_fallback(feature_name="--cpu-offload-gb",
-                               recommend_to_remove=False)
-            return False
        # Only Fp16 and Bf16 dtypes since we only support FA.
        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
        if model_config.dtype not in V1_SUPPORTED_DTYPES:
@@ -1617,7 +1447,7 @@ class EngineArgs:
            return False
        # Some quantization is not compatible with torch.compile.
-        V1_UNSUPPORTED_QUANT = ["bitsandbytes", "gguf"]
+        V1_UNSUPPORTED_QUANT = ["gguf"]
        if model_config.quantization in V1_UNSUPPORTED_QUANT:
            _raise_or_fallback(
                feature_name=f"--quantization {model_config.quantization}",
@@ -1636,21 +1466,11 @@ class EngineArgs:
                               recommend_to_remove=False)
            return False
-        # No TransformersModel support so far.
-        if (model_config.model_impl == ModelImpl.TRANSFORMERS
-                or model_config.model_impl == "transformers"):
-            _raise_or_fallback(
-                feature_name=f"model_impl={model_config.model_impl}",
-                recommend_to_remove=False)
-            return False
        # No Concurrent Partial Prefills so far.
        if (self.max_num_partial_prefills
                != EngineArgs.max_num_partial_prefills
                or self.max_long_partial_prefills
-                != EngineArgs.max_long_partial_prefills
+                != EngineArgs.max_long_partial_prefills):
-                or self.long_prefill_token_threshold
-                != EngineArgs.long_prefill_token_threshold):
            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                               recommend_to_remove=False)
            return False
@@ -1662,12 +1482,22 @@ class EngineArgs:
            return False
        # Only Ngram speculative decoding so far.
-        if (self.speculative_model is not None
+        is_ngram_enabled = False
-                or self.num_speculative_tokens is not None):
+        is_eagle_enabled = False
+        if self.speculative_config is not None:
            # This is supported but experimental (handled below).
-            if self.speculative_model in ("ngram", "[ngram]"):
+            speculative_method = self.speculative_config.get("method")
-                pass
+            if speculative_method:
+                if speculative_method in ("ngram", "[ngram]"):
+                    is_ngram_enabled = True
+                elif speculative_method == "eagle":
+                    is_eagle_enabled = True
            else:
+                speculative_model = self.speculative_config.get("model")
+                if speculative_model in ("ngram", "[ngram]"):
+                    is_ngram_enabled = True
+            if not (is_ngram_enabled or is_eagle_enabled):
+                # Other speculative decoding methods are not supported yet.
                _raise_or_fallback(feature_name="Speculative Decoding",
                                   recommend_to_remove=False)
                return False
@@ -1689,9 +1519,8 @@ class EngineArgs:
            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
            return False
-        # No support for device type other than CUDA, AMD (experiemntal) or
+        # Platforms must decide if they can support v1 for this model
-        # TPU (experimental) so far.
+        if not current_platform.supports_v1(model_config=model_config):
-        if not (current_platform.is_cuda_alike() or current_platform.is_tpu()):
            _raise_or_fallback(
                feature_name=f"device type={current_platform.device_type}",
                recommend_to_remove=False)
@@ -1704,23 +1533,26 @@ class EngineArgs:
                and _warn_or_fallback("Engine in background thread")):
            return False
-        # LoRA is supported on V1, but off by default for now.
+        # PP is supported on V1 with Ray distributed executor,
-        if self.enable_lora and _warn_or_fallback("LORA"):
+        # but off for MP distributed executor for now.
+        if (self.pipeline_parallel_size > 1
+                and self.distributed_executor_backend != "ray"):
+            name = "Pipeline Parallelism without Ray distributed executor"
+            _raise_or_fallback(feature_name=name, recommend_to_remove=False)
            return False
-        # PP is supported on V1, but off by default for now.
+        # ngram is supported on V1, but off by default for now.
-        if self.pipeline_parallel_size > 1 and _warn_or_fallback("PP"):
+        if is_ngram_enabled and _warn_or_fallback("ngram"):
            return False
-        # ngram is supported on V1, but off by default for now.
+        # Eagle is under development, so we don't support it yet.
-        if self.speculative_model in (
+        if is_eagle_enabled and _warn_or_fallback("Eagle"):
-                "ngram", "[ngram]") and _warn_or_fallback("ngram"):
            return False
        # Non-CUDA is supported on V1, but off by default for now.
        not_cuda = not current_platform.is_cuda()
        if not_cuda and _warn_or_fallback(  # noqa: SIM103
-                current_platform.device_type):
+                current_platform.device_name):
            return False
        #############################################################
@@ -1743,7 +1575,7 @@ class EngineArgs:
                is_gpu = current_platform.is_cuda()
                use_sliding_window = (model_config.get_sliding_window()
                                      is not None)
-                use_spec_decode = self.speculative_model is not None
+                use_spec_decode = self.speculative_config is not None
                if (is_gpu and not use_sliding_window and not use_spec_decode
                        and not self.enable_lora
@@ -1771,12 +1603,22 @@ class EngineArgs:
            msg = "Chunked prefill is not supported for pooling models"
            raise ValueError(msg)
-        # Disable prefix caching for multimodal models for VLLM_V0.
+        # if using prefix caching, we must set a hash algo
-        if (model_config.is_multimodal_model and self.enable_prefix_caching):
+        if self.enable_prefix_caching:
-            logger.warning(
+            # Disable prefix caching for multimodal models for VLLM_V0.
-                "--enable-prefix-caching is not supported for multimodal "
+            if model_config.is_multimodal_model:
-                "models in V0 and has been disabled.")
+                logger.warning(
-            self.enable_prefix_caching = False
+                    "--enable-prefix-caching is not supported for multimodal "
+                    "models in V0 and has been disabled.")
+                self.enable_prefix_caching = False
+            # VLLM_V0 only supports builtin hash algo for prefix caching.
+            if self.prefix_caching_hash_algo is None:
+                self.prefix_caching_hash_algo = "builtin"
+            elif self.prefix_caching_hash_algo == "sha256":
+                raise ValueError(
+                    "sha256 is not supported for prefix caching in V0 engine. "
+                    "Please use 'builtin'.")
        # Set max_num_seqs to 256 for VLLM_V0.
        if self.max_num_seqs is None:
@@ -1792,6 +1634,10 @@ class EngineArgs:
        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = True
+        # if using prefix caching, we must set a hash algo
+        if self.enable_prefix_caching and self.prefix_caching_hash_algo is None:
+            self.prefix_caching_hash_algo = "builtin"
        # V1 should use the new scheduler by default.
        # Swap it only if this arg is set to the original V0 default
        if self.scheduler_cls == EngineArgs.scheduler_cls:

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -303,8 +303,11 @@ class _AsyncLLMEngine(LLMEngine):
            ctx.seq_group_metadata_list = seq_group_metadata_list
            ctx.scheduler_outputs = scheduler_outputs
-            finished_requests_ids = self.scheduler[
+            if not scheduler_outputs.is_empty():
-                virtual_engine].get_and_reset_finished_requests_ids()
+                # this will cause mamba_cache/minimax_cache to fail
+                # to release finished_requests_ids of the last steps
+                finished_requests_ids = self.scheduler[
+                    virtual_engine].get_and_reset_finished_requests_ids()
            # Maybe switch from async mode to sync mode
            if not allow_async_output_proc and len(ctx.output_queue) > 0:
@@ -1222,8 +1225,8 @@ class AsyncLLMEngine(EngineClient):
    async def sleep(self, level: int = 1) -> None:
        self.engine.sleep(level)
-    async def wake_up(self) -> None:
+    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        self.engine.wake_up()
+        self.engine.wake_up(tags)
    async def is_sleeping(self) -> bool:
        return self.engine.is_sleeping()

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -8,8 +8,8 @@ from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    List, Mapping, NamedTuple, Optional)
+                    Iterable, List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload
@@ -31,8 +31,8 @@ from vllm.entrypoints.openai.logits_processors import (
    get_logits_processors as get_openai_logits_processors)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
-                         PromptType, SingletonInputsAdapter)
+                         PromptType)
-from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt
+from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.logits_process import get_bad_words_logits_processors
@@ -68,6 +68,7 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5
 _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
 _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
+_R = TypeVar("_R", default=Any)
 @dataclass
@@ -612,12 +613,7 @@ class LLMEngine:
        seq_id = next(self.seq_counter)
        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
-        if is_encoder_decoder_inputs(processed_inputs):
+        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
-            decoder_inputs = processed_inputs["decoder"]
-            encoder_inputs = processed_inputs["encoder"]
-        else:
-            decoder_inputs = processed_inputs
-            encoder_inputs = None
        seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
                       lora_request, prompt_adapter_request)
@@ -1959,10 +1955,10 @@ class LLMEngine:
            "Sleep mode is not enabled in the model config")
        self.model_executor.sleep(level=level)
-    def wake_up(self) -> None:
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        assert self.vllm_config.model_config.enable_sleep_mode, (
            "Sleep mode is not enabled in the model config")
-        self.model_executor.wake_up()
+        self.model_executor.wake_up(tags)
    def is_sleeping(self) -> bool:
        return self.model_executor.is_sleeping
@@ -2048,15 +2044,16 @@ class LLMEngine:
    def _validate_model_inputs(self, inputs: ProcessorInputs,
                               lora_request: Optional[LoRARequest]):
-        if is_encoder_decoder_inputs(inputs):
+        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
-            # For encoder-decoder multimodal models, the max_prompt_len
-            # restricts the decoder prompt length
+        # For encoder-decoder multimodal models, the max_prompt_len
-            prompt_inputs = inputs["decoder" if self.model_config.
+        # restricts the decoder prompt length
-                                   is_multimodal_model else "encoder"]
+        if self.model_config.is_multimodal_model:
+            prompt_inputs = decoder_inputs
        else:
-            prompt_inputs = inputs
+            prompt_inputs = encoder_inputs or decoder_inputs
-        prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids
+        prompt_ids = prompt_inputs["prompt_token_ids"]
        if prompt_ids is None or len(prompt_ids) == 0:
            raise ValueError("Prompt cannot be empty")
@@ -2101,8 +2098,9 @@ class LLMEngine:
            guided_decoding.backend = guided_decoding.backend or \
                self.decoding_config.guided_decoding_backend
-            logger.debug("Reasoning backend: %s",
+            if self.decoding_config.reasoning_backend is not None:
-                         self.decoding_config.reasoning_backend)
+                logger.debug("Building with reasoning backend %s",
+                             self.decoding_config.reasoning_backend)
            processor = get_local_guided_decoding_logits_processor(
                guided_params=guided_decoding,
@@ -2143,6 +2141,14 @@ class LLMEngine:
        return sampling_params
+    def collective_rpc(self,
+                       method: Union[str, Callable[..., _R]],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
+        return self.model_executor.collective_rpc(method, timeout, args,
+                                                  kwargs)
 if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine

--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -52,6 +52,11 @@ class Metrics:
        max_model_len = vllm_config.model_config.max_model_len
+        # Use this flag to hide metrics that were deprecated in
+        # a previous release and which will be removed in the future
+        self.show_hidden_metrics = \
+            vllm_config.observability_config.show_hidden_metrics
        # System stats
        #   Scheduler State
        self.gauge_scheduler_running = self._gauge_cls(
@@ -76,14 +81,15 @@ class Metrics:
        )
        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+        # Hidden in 0.9, due to be removed in 0.10
-        self.gauge_scheduler_swapped = self._gauge_cls(
+        if self.show_hidden_metrics:
-            name="vllm:num_requests_swapped",
+            self.gauge_scheduler_swapped = self._gauge_cls(
-            documentation=(
+                name="vllm:num_requests_swapped",
-                "Number of requests swapped to CPU. "
+                documentation=(
-                "DEPRECATED: KV cache offloading is not used in V1"),
+                    "Number of requests swapped to CPU. "
-            labelnames=labelnames,
+                    "DEPRECATED: KV cache offloading is not used in V1"),
-            multiprocess_mode="sum")
+                labelnames=labelnames,
+                multiprocess_mode="sum")
        #   KV Cache Usage in %
        self.gauge_gpu_cache_usage = self._gauge_cls(
@@ -93,34 +99,33 @@ class Metrics:
            multiprocess_mode="sum")
        # Deprecated in 0.8 - KV cache offloading is not used in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+        # Hidden in 0.9, due to be removed in 0.10
-        self.gauge_cpu_cache_usage = self._gauge_cls(
+        if self.show_hidden_metrics:
-            name="vllm:cpu_cache_usage_perc",
+            self.gauge_cpu_cache_usage = self._gauge_cls(
-            documentation=(
+                name="vllm:cpu_cache_usage_perc",
-                "CPU KV-cache usage. 1 means 100 percent usage. "
+                documentation=(
-                "DEPRECATED: KV cache offloading is not used in V1"),
+                    "CPU KV-cache usage. 1 means 100 percent usage. "
-            labelnames=labelnames,
+                    "DEPRECATED: KV cache offloading is not used in V1"),
-            multiprocess_mode="sum")
+                labelnames=labelnames,
+                multiprocess_mode="sum")
-        # Deprecated in 0.8 - KV cache offloading is not used in V1
+            self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+                name="vllm:cpu_prefix_cache_hit_rate",
-        self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
+                documentation=(
-            name="vllm:cpu_prefix_cache_hit_rate",
+                    "CPU prefix cache block hit rate. "
-            documentation=(
+                    "DEPRECATED: KV cache offloading is not used in V1"),
-                "CPU prefix cache block hit rate. "
+                labelnames=labelnames,
-                "DEPRECATED: KV cache offloading is not used in V1"),
+                multiprocess_mode="sum")
-            labelnames=labelnames,
-            multiprocess_mode="sum")
        # Deprecated in 0.8 - replaced by queries+hits counters in V1
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+        # Hidden in 0.9, due to be removed in 0.10
-        self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
+        if self.show_hidden_metrics:
-            name="vllm:gpu_prefix_cache_hit_rate",
+            self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
-            documentation=("GPU prefix cache block hit rate. "
+                name="vllm:gpu_prefix_cache_hit_rate",
-                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
+                documentation=("GPU prefix cache block hit rate. "
-                           "vllm:gpu_prefix_cache_queries in V1"),
+                               "DEPRECATED: use vllm:gpu_prefix_cache_queries "
-            labelnames=labelnames,
+                               "and vllm:gpu_prefix_cache_queries in V1"),
-            multiprocess_mode="sum")
+                labelnames=labelnames,
+                multiprocess_mode="sum")
        # Iteration stats
        self.counter_num_preemption = self._counter_cls(
@@ -198,33 +203,35 @@ class Metrics:
            labelnames=labelnames,
            buckets=request_latency_buckets)
        # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+        # Hidden in 0.9, due to be removed in 0.10
-        self.histogram_time_in_queue_request = self._histogram_cls(
+        if self.show_hidden_metrics:
-            name="vllm:time_in_queue_requests",
+            self.histogram_time_in_queue_request = self._histogram_cls(
-            documentation=(
+                name="vllm:time_in_queue_requests",
-                "Histogram of time the request spent in the queue in seconds. "
+                documentation=
-                "DEPRECATED: use vllm:request_queue_time_seconds instead."),
+                ("Histogram of time the request spent in the queue in seconds. "
-            labelnames=labelnames,
+                 "DEPRECATED: use vllm:request_queue_time_seconds instead."),
-            buckets=request_latency_buckets)
+                labelnames=labelnames,
+                buckets=request_latency_buckets)
        # Deprecated in 0.8 - use prefill/decode/inference time metrics
-        # TODO: in 0.9, only enable if show_hidden_metrics=True
+        # Hidden in 0.9, due to be removed in 0.10
-        self.histogram_model_forward_time_request = self._histogram_cls(
+        if self.show_hidden_metrics:
-            name="vllm:model_forward_time_milliseconds",
+            self.histogram_model_forward_time_request = self._histogram_cls(
-            documentation=(
+                name="vllm:model_forward_time_milliseconds",
-                "Histogram of time spent in the model forward pass in ms. "
+                documentation=
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
+                ("Histogram of time spent in the model forward pass in ms. "
-            ),
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-            labelnames=labelnames,
+                 ),
-            buckets=build_1_2_3_5_8_buckets(3000))
+                labelnames=labelnames,
-        self.histogram_model_execute_time_request = self._histogram_cls(
+                buckets=build_1_2_3_5_8_buckets(3000))
-            name="vllm:model_execute_time_milliseconds",
+            self.histogram_model_execute_time_request = self._histogram_cls(
-            documentation=(
+                name="vllm:model_execute_time_milliseconds",
-                "Histogram of time spent in the model execute function in ms."
+                documentation=
-                "DEPRECATED: use prefill/decode/inference time metrics instead."
+                ("Histogram of time spent in the model execute function in ms."
-            ),
+                 "DEPRECATED: use prefill/decode/inference time metrics instead"
-            labelnames=labelnames,
+                 ),
-            buckets=build_1_2_3_5_8_buckets(3000))
+                labelnames=labelnames,
+                buckets=build_1_2_3_5_8_buckets(3000))
        #   Metadata
        self.histogram_num_prompt_tokens_request = self._histogram_cls(
@@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase):
        self.metrics = self._metrics_cls(labelnames=list(labels.keys()),
                                         vllm_config=vllm_config)
-        # Use this flag to hide metrics that were deprecated in
-        # a previous release and which will be removed future
-        self.show_hidden_metrics = \
-            vllm_config.observability_config.show_hidden_metrics
    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for logging to gauge.
        gauge.labels(**self.labels).set(data)
@@ -580,18 +582,20 @@ class PrometheusStatLogger(StatLoggerBase):
        # System state data
        self._log_gauge(self.metrics.gauge_scheduler_running,
                        stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped,
+        if self.metrics.show_hidden_metrics:
-                        stats.num_swapped_sys)
+            self._log_gauge(self.metrics.gauge_scheduler_swapped,
+                            stats.num_swapped_sys)
        self._log_gauge(self.metrics.gauge_scheduler_waiting,
                        stats.num_waiting_sys)
        self._log_gauge(self.metrics.gauge_gpu_cache_usage,
                        stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage,
+        if self.metrics.show_hidden_metrics:
-                        stats.cpu_cache_usage_sys)
+            self._log_gauge(self.metrics.gauge_cpu_cache_usage,
-        self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
+                            stats.cpu_cache_usage_sys)
-                        stats.cpu_prefix_cache_hit_rate)
+            self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
-        self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.cpu_prefix_cache_hit_rate)
-                        stats.gpu_prefix_cache_hit_rate)
+            self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
+                            stats.gpu_prefix_cache_hit_rate)
        # Including max-lora in metric, in future this property of lora
        # config maybe extended to be dynamic.
        lora_info = {
@@ -629,12 +633,15 @@ class PrometheusStatLogger(StatLoggerBase):
                            stats.time_prefill_requests)
        self._log_histogram(self.metrics.histogram_decode_time_request,
                            stats.time_decode_requests)
-        self._log_histogram(self.metrics.histogram_time_in_queue_request,
+        if self.metrics.show_hidden_metrics:
-                            stats.time_in_queue_requests)
+            self._log_histogram(self.metrics.histogram_time_in_queue_request,
-        self._log_histogram(self.metrics.histogram_model_forward_time_request,
+                                stats.time_in_queue_requests)
-                            stats.model_forward_time_requests)
+            self._log_histogram(
-        self._log_histogram(self.metrics.histogram_model_execute_time_request,
+                self.metrics.histogram_model_forward_time_request,
-                            stats.model_execute_time_requests)
+                stats.model_forward_time_requests)
+            self._log_histogram(
+                self.metrics.histogram_model_execute_time_request,
+                stats.model_execute_time_requests)
        # Metadata
        finished_reason_counter = CollectionsCounter(
            stats.finished_reason_requests)