Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple,
+                    Type, Union)

 import torch
 from transformers import PretrainedConfig
@@ -10,10 +11,14 @@ import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
-from vllm.tracing import is_otel_installed
-from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
+from vllm.platforms import current_platform
+from vllm.tracing import is_otel_available, otel_import_error_traceback
+from vllm.transformers_utils.config import (get_config,
+                                            get_hf_image_processor_config,
+                                            get_hf_text_config)
+from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
+                        cuda_device_count_stateless, get_cpu_memory, is_cpu,
+                        is_hip, is_neuron, is_openvino, is_xpu,
                        print_warning_once)

 if TYPE_CHECKING:
@@ -26,7 +31,6 @@ if TYPE_CHECKING:

 logger = init_logger(__name__)

-_GB = 1 << 30
 _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768

 _PP_SUPPORTED_MODELS = [
@@ -34,6 +38,7 @@ _PP_SUPPORTED_MODELS = [
    "AquilaForCausalLM",
    "DeepseekV2ForCausalLM",
    "InternLMForCausalLM",
+    "JAISLMHeadModel",
    "LlamaForCausalLM",
    "LLaMAForCausalLM",
    "MistralForCausalLM",
@@ -87,6 +92,9 @@ class ModelConfig:
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
+            If None, the user did not specify, so default to False -
+            except for encoder/decoder models, which currently require
+            eager mode.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
@@ -103,6 +111,8 @@ class ModelConfig:
            matches the model name exposed via the APIs. If multiple model 
            names provided, the first name will be used. If not specified, 
            the model name will be the same as `model`.
+        limit_mm_per_prompt: Maximum number of data instances per modality 
+            per prompt. Only applicable for multimodal models.
    """

    def __init__(
@@ -119,16 +129,17 @@ class ModelConfig:
        rope_theta: Optional[float] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
+        spec_target_max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        quantization_param_path: Optional[str] = None,
-        enforce_eager: bool = False,
+        enforce_eager: Optional[bool] = None,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 20,
        disable_sliding_window: bool = False,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
-        multimodal_config: Optional["MultiModalConfig"] = None,
+        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
@@ -158,8 +169,38 @@ class ModelConfig:
        self.hf_config = get_config(self.model, trust_remote_code, revision,
                                    code_revision, rope_scaling, rope_theta)
        self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.hf_image_processor_config = get_hf_image_processor_config(
+            self.model, revision)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

+        # Choose a default enforce_eager value if the user did not specify
+        # a value (enforce_eager is None)
+        if getattr(self.hf_config, 'is_encoder_decoder', False):
+            if self.enforce_eager is None:
+                # *Only for encoder/decoder models* and
+                # *only if enforce_eager is unset*, override
+                # to enforce_eager=True
+                #
+                # Add a logger message since it is *somewhat* non-intuitive that
+                # enforce_eager is True when the user has not specified its
+                # value.
+                logger.info("Forcing enforce_eager == True because "
+                            "enforce_eager setting was unspecified and "
+                            "CUDAGraph is not supported with encoder/ "
+                            "decoder models.")
+                self.enforce_eager = True
+
+            if not self.enforce_eager:
+                # Eager mode explicitly disabled by user for an encoder/
+                # decoder model; however CUDAGRAPH + encoder/decoder is
+                # not currently supported
+                raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH)
+        elif self.enforce_eager is None:
+            # *Only for decoder-only models*, enforce_eager
+            # defaults to False if unset. This is intuitive
+            # so no logging message needed.
+            self.enforce_eager = False
+
        if (not self.disable_sliding_window
                and self.hf_text_config.model_type == "gemma2"
                and self.hf_text_config.sliding_window is not None):
@@ -174,17 +215,33 @@ class ModelConfig:
            hf_config=self.hf_text_config,
            max_model_len=max_model_len,
            disable_sliding_window=self.disable_sliding_window,
-            sliding_window_len=self.get_hf_config_sliding_window())
+            sliding_window_len=self.get_hf_config_sliding_window(),
+            spec_target_max_model_len=spec_target_max_model_len)
        self.served_model_name = get_served_model_name(model,
                                                       served_model_name)
-        self.multimodal_config = multimodal_config
-
+        self.multimodal_config = self._init_multimodal_config(
+            limit_mm_per_prompt)
        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

+    def _init_multimodal_config(
+        self, limit_mm_per_prompt: Optional[Mapping[str, int]]
+    ) -> Optional["MultiModalConfig"]:
+        architectures = getattr(self.hf_config, "architectures", [])
+        if any(
+                ModelRegistry.is_multimodal_model(arch)
+                for arch in architectures):
+            return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
+        else:
+            if limit_mm_per_prompt:
+                raise ValueError(
+                    "limit_mm_per_prompt is only supported for multimodal "
+                    "models.")
+            return None
+
    def _verify_tokenizer_mode(self) -> None:
        tokenizer_mode = self.tokenizer_mode.lower()
        if tokenizer_mode not in ["auto", "slow"]:
@@ -207,11 +264,13 @@ class ModelConfig:

    def _verify_quantization(self) -> None:
        supported_quantization = [*QUANTIZATION_METHODS]
-        rocm_supported_quantization = ["gptq", "squeezellm","awq"]
+        rocm_supported_quantization = ["gptq", "squeezellm","awq"]  # "fp8"
        optimized_quantization_methods = [
            "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
-            "fbgemm_fp8", "compressed_tensors", "compressed-tensors"
+            "fbgemm_fp8", "compressed_tensors", "compressed-tensors",
+            "experts_int8"
        ]
+        tpu_supported_quantization = ["tpu_int8"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

@@ -250,6 +309,11 @@ class ModelConfig:
                raise ValueError(
                    f"{self.quantization} quantization is currently not "
                    f"supported in ROCm.")
+            if current_platform.is_tpu(
+            ) and self.quantization not in tpu_supported_quantization:
+                raise ValueError(
+                    f"{self.quantization} quantization is currently not "
+                    f"supported in TPU Backend.")
            if self.quantization not in optimized_quantization_methods:
                logger.warning(
                    "%s quantization is not fully "
@@ -290,8 +354,9 @@ class ModelConfig:
                "BitAndBytes quantization with TP or PP is not supported yet.")

        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            raise ValueError(
-                "BitAndBytes with enforce_eager = False is not supported yet.")
+            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
+                           "fallback to the eager mode.")
+            self.enforce_eager = True

    def get_hf_config_sliding_window(self) -> Optional[int]:
        """Get the sliding window size, or None if disabled."""
@@ -425,6 +490,28 @@ class ModelConfig:
            if t != "attention"
        ])

+    def get_multimodal_config(self) -> "MultiModalConfig":
+        """
+        Get the multimodal configuration of the model.
+
+        Raises:
+            ValueError: If the model is not multimodal.
+        """
+        if self.multimodal_config is None:
+            raise ValueError("The model is not multimodal.")
+
+        return self.multimodal_config
+
+    @property
+    def is_encoder_decoder_model(self) -> bool:
+        """Extract the HF encoder/decoder model flag."""
+        return getattr(self.hf_config, "is_encoder_decoder", False)
+
+    @property
+    def is_embedding_model(self) -> bool:
+        """Extract the embedding model flag."""
+        return self.embedding_mode
+

 class CacheConfig:
    """Configuration for the KV cache.
@@ -443,7 +530,7 @@ class CacheConfig:
        self,
        block_size: int,
        gpu_memory_utilization: float,
-        swap_space: int,
+        swap_space: float,
        cache_dtype: str,
        num_gpu_blocks_override: Optional[int] = None,
        sliding_window: Optional[int] = None,
@@ -452,7 +539,7 @@ class CacheConfig:
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
-        self.swap_space_bytes = swap_space * _GB
+        self.swap_space_bytes = swap_space * GiB_bytes
        self.num_gpu_blocks_override = num_gpu_blocks_override
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
@@ -497,10 +584,6 @@ class CacheConfig:
            raise NotImplementedError(
                "Prefix caching is not supported with sliding window. "
                "Run with --disable-sliding-window to use prefix caching.")
-        if self.cache_dtype == "fp8":
-            raise NotImplementedError(
-                "Prefix caching is not supported for fp8 cache_dtype. "
-                "Run with --kv-cache-dtype auto to use prefix caching.")

    def verify_with_parallel_config(
        self,
@@ -512,9 +595,9 @@ class CacheConfig:
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

-        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
-               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
-               "allocated for the swap space.")
+        msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
+               f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
+               "is allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
@@ -582,6 +665,7 @@ class LoadFormat(str, enum.Enum):
    DUMMY = "dummy"
    TENSORIZER = "tensorizer"
    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
    BITSANDBYTES = "bitsandbytes"


@@ -692,8 +776,8 @@ class ParallelConfig:
        self.tokenizer_pool_config = tokenizer_pool_config
        self.ray_workers_use_nsight = ray_workers_use_nsight
        self.placement_group = placement_group
-
        self.world_size = pipeline_parallel_size * self.tensor_parallel_size
+
        if worker_use_ray:
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "ray"
@@ -789,6 +873,11 @@ class SchedulerConfig:
            swapping. However, when the sequence group has multiple sequences
            (e.g., beam search), recomputation is not currently supported. In
            such a case, we use swapping instead.
+        send_delta_data: Private API. If used, scheduler sends delta data to
+            workers instead of an entire data. It should be enabled only
+            when SPMD worker architecture is enabled. I.e.,
+            VLLM_USE_RAY_SPMD_WORKER=1
+
    """

    def __init__(self,
@@ -800,7 +889,9 @@ class SchedulerConfig:
                 delay_factor: float = 0.0,
                 enable_chunked_prefill: bool = False,
                 embedding_mode: Optional[bool] = False,
-                 preemption_mode: Optional[str] = None) -> None:
+                 preemption_mode: Optional[str] = None,
+                 num_scheduler_steps: int = 1,
+                 send_delta_data: bool = False) -> None:
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        else:
@@ -829,6 +920,8 @@ class SchedulerConfig:
        self.chunked_prefill_enabled = enable_chunked_prefill
        self.embedding_mode = embedding_mode
        self.preemption_mode = preemption_mode
+        self.num_scheduler_steps = num_scheduler_steps
+        self.send_delta_data = send_delta_data
        self._verify_args()

    def _verify_args(self) -> None:
@@ -854,6 +947,16 @@ class SchedulerConfig:
                f"({self.num_lookahead_slots}) must be greater than or "
                "equal to 0.")

+        if self.num_scheduler_steps < 1:
+            raise ValueError(
+                "num_scheduler_steps "
+                f"({self.num_scheduler_steps}) must be greater than or "
+                "equal to 1.")
+
+    @property
+    def is_multi_step(self) -> bool:
+        return self.num_scheduler_steps > 1
+

 class DeviceConfig:
    device: Optional[torch.device]
@@ -865,7 +968,7 @@ class DeviceConfig:
                self.device_type = "neuron"
            elif is_openvino():
                self.device_type = "openvino"
-            elif is_tpu():
+            elif current_platform.is_tpu():
                self.device_type = "tpu"
            elif is_cpu():
                self.device_type = "cpu"
@@ -902,6 +1005,7 @@ class SpeculativeConfig:
        target_parallel_config: ParallelConfig,
        target_dtype: str,
        speculative_model: Optional[str],
+        speculative_model_quantization: Optional[str],
        speculative_draft_tensor_parallel_size: Optional[int],
        num_speculative_tokens: Optional[int],
        speculative_max_model_len: Optional[int],
@@ -930,6 +1034,9 @@ class SpeculativeConfig:
            target_dtype (str): The data type used for the target model.
            speculative_model (Optional[str]): The name of the speculative
                model, if provided.
+            speculative_model_quantization (Optional[str]): Quantization method
+                that was used to quantize the speculative model weights. If
+                None, we assume the model weights are not quantized.
            speculative_draft_tensor_parallel_size (Optional[int]): The degree
                of the tensor parallelism for the draft model.
            num_speculative_tokens (Optional[int]): The number of speculative
@@ -997,11 +1104,11 @@ class SpeculativeConfig:
                "Speculative decoding requires usage of the V2 "
                "block manager. Enable it with --use-v2-block-manager.")

-        # TODO: The user should be able to specify revision/quantization/max
-        # model len for the draft model. It is not currently supported.
+        # TODO: The user should be able to specify revision/max model len
+        # for the draft model. It is not currently supported.
        draft_revision = None
        draft_code_revision = None
-        draft_quantization = None
+        draft_quantization = speculative_model_quantization

        if speculative_model == "[ngram]":
            if ngram_prompt_lookup_min is None:
@@ -1033,6 +1140,7 @@ class SpeculativeConfig:
                code_revision=draft_code_revision,
                tokenizer_revision=target_model_config.tokenizer_revision,
                max_model_len=None,
+                spec_target_max_model_len=target_model_config.max_model_len,
                quantization=draft_quantization,
                enforce_eager=target_model_config.enforce_eager,
                max_seq_len_to_capture=target_model_config.
@@ -1158,7 +1266,7 @@ class SpeculativeConfig:
        elif speculative_draft_tensor_parallel_size != 1:
            # TODO(wooyeon): allow tp values larger than 1
            raise ValueError(
-                f"{speculative_draft_tensor_parallel_size=} cannot be"
+                f"{speculative_draft_tensor_parallel_size=} cannot be "
                f"other value than 1")

        draft_parallel_config = ParallelConfig(
@@ -1310,8 +1418,9 @@ class LoRAConfig:
    long_lora_scaling_factors: Optional[Tuple[float]] = None

    def __post_init__(self):
-        # TODO: Increase the range of rank
-        possible_max_ranks = (8, 16, 32, 64)
+        # Setting the maximum rank to 256 should be able to satisfy the vast
+        # majority of applications.
+        possible_max_ranks = (8, 16, 32, 64, 128, 256)
        possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
            raise ValueError(
@@ -1343,11 +1452,6 @@ class LoRAConfig:
                           model_config.quantization)

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
-            raise ValueError(
-                "Due to limitations of the custom LoRA CUDA kernel, "
-                "max_num_batched_tokens must be <= 65528 when "
-                "LoRA is enabled.")
        if scheduler_config.chunked_prefill_enabled:
            raise ValueError("LoRA is not supported with chunked prefill yet.")

@@ -1387,10 +1491,15 @@ class PromptAdapterConfig:

 @dataclass
 class MultiModalConfig:
-    """Configs the input data format and how models should run for
-    multimodal models."""
+    """Controls the behavior of multimodal models."""
+
+    limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
+    """
+    The maximum number of multi-modal input instances allowed per prompt
+    for each :class:`~vllm.multimodal.MultiModalPlugin`.
+    """
+
    # TODO: Add configs to init vision tower or not.
-    pass


 _STR_DTYPE_TO_TORCH_DTYPE = {
@@ -1461,6 +1570,7 @@ def _get_and_verify_max_len(
    max_model_len: Optional[int],
    disable_sliding_window: bool,
    sliding_window_len: Optional[int],
+    spec_target_max_model_len: Optional[int] = None,
 ) -> int:
    """Get and verify the model's maximum length."""
    derived_max_model_len = float("inf")
@@ -1503,6 +1613,11 @@ def _get_and_verify_max_len(
            # If max_model_len is specified, we use it.
            return max_model_len

+        if spec_target_max_model_len is not None:
+            # If this is a speculative draft model, we use the max model len
+            # from the target model.
+            return spec_target_max_model_len
+
        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
@@ -1610,10 +1725,27 @@ class ObservabilityConfig:
    """Configuration for observability."""
    otlp_traces_endpoint: Optional[str] = None

+    # Collecting detailed timing information for each request can be expensive.
+
+    # If set, collects the model forward time for the request.
+    collect_model_forward_time: bool = False
+
+    # If set, collects the model execute time for the request.
+    collect_model_execute_time: bool = False
+
    def __post_init__(self):
-        if not is_otel_installed() and self.otlp_traces_endpoint is not None:
-            raise ValueError("OpenTelemetry packages must be installed before "
-                             "configuring 'otlp_traces_endpoint'")
+        if not is_otel_available() and self.otlp_traces_endpoint is not None:
+            raise ValueError(
+                "OpenTelemetry is not available. Unable to configure "
+                "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
+                f"installed. Original error:\n{otel_import_error_traceback}")
+
+        if ((self.collect_model_forward_time
+             or self.collect_model_execute_time)
+                and self.otlp_traces_endpoint is None):
+            raise ValueError(
+                "collect_model_forward_time or collect_model_execute_time "
+                "requires --otlp-traces-endpoint to be set.")


 @dataclass(frozen=True)
@@ -1629,7 +1761,6 @@ class EngineConfig:
    device_config: DeviceConfig
    load_config: LoadConfig
    lora_config: Optional[LoRAConfig]
-    multimodal_config: Optional[MultiModalConfig]
    speculative_config: Optional[SpeculativeConfig]
    decoding_config: Optional[DecodingConfig]
    observability_config: Optional[ObservabilityConfig]

--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -356,7 +356,13 @@ class BlockTable:
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.
+
+        If no token ids are provided, then no chunks are returned.
        """
+
+        if not token_ids:
+            return []
+
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]

--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
 from collections import deque
+from dataclasses import dataclass
 from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple

 from vllm.core.block.interfaces import Block, BlockAllocator
@@ -282,6 +283,58 @@ class BlockList:
        return self._block_ids


+@dataclass
+class CacheMetricData:
+    """A utility dataclass to maintain cache metric.
+    To avoid overflow, we maintain the hit rate in block granularity, so that
+    we can maintain a single hit rate for n_completed_block x block_size,
+    and calculate the real time hit rate by the following:
+    BS = The number of queries per block.
+    nB = The number of completed blocks.
+    HR = hit rate of (nB x BS) queries.
+    Q = current number of queries (< BS).
+    H = current number of hits (< BS).
+    hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
+    """
+    num_completed_blocks: int = 0
+    completed_block_cache_hit_rate: float = 0.0
+    num_incompleted_block_queries: int = 0
+    num_incompleted_block_hit: int = 0
+    block_size: int = 1000
+
+    def query(self, hit: bool):
+        self.num_incompleted_block_queries += 1
+        self.num_incompleted_block_hit += 1 if hit else 0
+
+        # When a block is completed, update the cache hit rate
+        # and reset the incomplete numbers.
+        if self.num_incompleted_block_queries == self.block_size:
+            hit_rate = (self.num_incompleted_block_hit /
+                        self.num_incompleted_block_queries)
+            self.completed_block_cache_hit_rate = (
+                self.completed_block_cache_hit_rate * self.num_completed_blocks
+                + hit_rate) / (self.num_completed_blocks + 1)
+            self.num_incompleted_block_queries = 0
+            self.num_incompleted_block_hit = 0
+            self.num_completed_blocks += 1
+
+    def get_hit_rate(self):
+        incomplete_ratio = self.num_incompleted_block_queries / self.block_size
+        total_blocks = self.num_completed_blocks + incomplete_ratio
+        if total_blocks == 0:
+            return 0.0
+
+        completed_block_hit, incompleted_block_hit = 0.0, 0.0
+        if self.num_completed_blocks > 0:
+            completed_block_hit = (self.completed_block_cache_hit_rate *
+                                   self.num_completed_blocks)
+        if self.num_incompleted_block_queries > 0:
+            incompleted_hit_rate = (self.num_incompleted_block_hit /
+                                    self.num_incompleted_block_queries)
+            incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio)
+        return (completed_block_hit + incompleted_block_hit) / total_blocks
+
+
 def get_all_blocks_recursively(last_block: Block) -> List[Block]:
    """Retrieves all the blocks in a sequence starting from the last block.


--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -323,6 +323,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
    def all_block_ids(self) -> FrozenSet[int]:
        return frozenset(self._block_ids_to_allocator.keys())

+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        """Prefix cache hit rate. -1 means not supported or disabled."""
+        assert device in self._allocators
+        return self._allocators[device].get_prefix_cache_hit_rate()
+
    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
        """Returns and clears the mapping of source to destination block IDs.
        Will be called after every swapping operations for now, and after every

--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -186,6 +186,11 @@ class BlockAllocator(ABC):
                               num_lookahead_slots: int = 0) -> int:
        pass

+    @abstractmethod
+    def get_prefix_cache_hit_rate(self) -> float:
+        """Prefix cache hit rate. -1 means not supported or disabled."""
+        pass
+
    class NoFreeBlocksError(ValueError):
        pass

@@ -278,3 +283,8 @@ class DeviceAwareBlockAllocator(ABC):
        There is at most one null block per allocator.
        """
        pass
+
+    @abstractmethod
+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        """Prefix cache hit rate. -1 means not supported or disabled."""
+        pass
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -307,9 +307,8 @@ class NaiveBlockAllocator(BlockAllocator):
        # TODO(cade): make sure the logic is correct and clean it up.
        for block in blocks:
            if not block.is_full and num_lookahead_slots != 0:
-                if block.num_empty_slots >= num_lookahead_slots:
-                    new_block_count += 1
-                else:
+                new_block_count += 1
+                if num_lookahead_slots > block.num_empty_slots:
                    new_block_count += cdiv(
                        num_lookahead_slots - block.num_empty_slots,
                        self._block_size)
@@ -342,6 +341,9 @@ class NaiveBlockAllocator(BlockAllocator):

            block.block_id = block_id  # Assign block_id

+    def get_prefix_cache_hit_rate(self) -> float:
+        return -1
+

 class NaiveBlock(Block):
    """An implementation of the Block class that does not support prefix

--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
 """Token blocks."""
-
 from os.path import commonprefix
 from typing import Dict, FrozenSet, Iterable, List, Optional, Tuple

-from vllm.core.block.common import (CopyOnWriteTracker,
+from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
                                    get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
@@ -107,6 +106,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
        self._cow_tracker = CopyOnWriteTracker(
            refcounter=self._refcounter.as_readonly())

+        self.metric_data = CacheMetricData()
+
    # Implements Block.Factory.
    def _create_block(
        self,
@@ -155,9 +156,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):

        cached_block_id = self._cached_blocks.get(block.content_hash, None)
        if cached_block_id is not None:
+            self.metric_data.query(hit=True)
            block.block_id = cached_block_id
            self._incr_refcount_cached_block(block)
            return block
+        self.metric_data.query(hit=False)
        self._block_pool.free_block(block)

        # No cached block => Allocate a new block
@@ -404,6 +407,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
    def all_block_ids(self) -> FrozenSet[int]:
        return self._hashless_allocator.all_block_ids

+    def get_prefix_cache_hit_rate(self) -> float:
+        return self.metric_data.get_hit_rate()
+
    def is_block_cached(self, block: Block) -> bool:
        assert block.content_hash is not None
        if block.content_hash in self._cached_blocks:
@@ -579,14 +585,17 @@ class PrefixCachingBlockAllocator(BlockAllocator):
        num_touched_blocks = 0
        for block in blocks:
            if not block.is_full:
-                if block.num_empty_slots >= num_lookahead_slots:
-                    num_touched_blocks += 1
-                else:
+                num_touched_blocks += 1
+                if num_lookahead_slots > block.num_empty_slots:
                    num_touched_blocks += cdiv(
                        num_lookahead_slots - block.num_empty_slots,
                        self._block_size)
            else:
-                if not self.is_block_cached(block):
+                # If the block has a match in the cache and the cached block
+                # is not referenced, then we still count it as a touched block
+                if not self.is_block_cached(block) or \
+                    (block.content_hash is not None and \
+                     self._cached_blocks[block.content_hash] in self.evictor):
                    num_touched_blocks += 1
        return num_touched_blocks


--- a/vllm/core/block/utils.py
+++ b/vllm/core/block/utils.py
 """Block manager utils."""
 from vllm.sequence import SequenceGroup
-
-# Exception strings for non-implemented block manager enc/dec scenarios
-
-STR_NOT_IMPL_ENC_DEC_SWA = \
-    "Sliding window attention for encoder/decoder models " + \
-                    "is not currently supported."
-
-STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
-    "Prefix caching for encoder/decoder models " + \
-                    "is not currently supported."
+from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
+                        STR_NOT_IMPL_ENC_DEC_SWA)


 def _get_block_mgr_sliding_window_attr(block_mgr):

--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -8,6 +8,7 @@ from typing import Sequence as GenericSequence
 from typing import Set, Tuple

 from vllm.block import BlockTable, PhysicalTokenBlock
+from vllm.core.block.common import CacheMetricData
 from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
 from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
@@ -60,6 +61,11 @@ class BlockAllocatorBase(ABC):
    def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
        pass

+    @abstractmethod
+    def get_prefix_cache_hit_rate(self) -> float:
+        """Prefix cache hit rate. -1 means not supported or disabled."""
+        pass
+

 class CachedBlockAllocator(BlockAllocatorBase):
    """Manages free physical token blocks for a device.
@@ -85,6 +91,8 @@ class CachedBlockAllocator(BlockAllocatorBase):

        self.default_hash_ctr = count()

+        self.cache_metric_data = CacheMetricData()
+
    def allocate_block(self, block_hash: int,
                       num_hashed_tokens: int) -> PhysicalTokenBlock:
        if self.current_num_blocks == self.num_blocks:
@@ -105,15 +113,17 @@ class CachedBlockAllocator(BlockAllocatorBase):
                 num_hashed_tokens: int = 0) -> PhysicalTokenBlock:
        if block_hash is None:
            block_hash = next(self.default_hash_ctr)
+
        if block_hash in self.evictor:
            assert block_hash not in self.cached_blocks
            block = self.evictor.remove(block_hash)
            assert block.ref_count == 0
            self.cached_blocks[block_hash] = block
-            block.ref_count += 1
-            assert block.block_hash == block_hash
-            return block
-        if block_hash not in self.cached_blocks:
+
+        if block_hash in self.cached_blocks:
+            self.cache_metric_data.query(hit=True)
+        else:
+            self.cache_metric_data.query(hit=False)
            self.cached_blocks[block_hash] = self.allocate_block(
                block_hash, num_hashed_tokens)
        block = self.cached_blocks[block_hash]
@@ -150,6 +160,9 @@ class CachedBlockAllocator(BlockAllocatorBase):
        del self.cached_blocks[old_hash]
        self.cached_blocks[block_hash] = block

+    def get_prefix_cache_hit_rate(self) -> float:
+        return self.cache_metric_data.get_hit_rate()
+

 class UncachedBlockAllocator(BlockAllocatorBase):
    """Manages free physical token blocks for a device.
@@ -170,7 +183,7 @@ class UncachedBlockAllocator(BlockAllocatorBase):
        self.num_blocks = num_blocks

        # Initialize the free blocks.
-        self.free_blocks: BlockTable = []
+        self.free_blocks: List[PhysicalTokenBlock] = []
        for i in range(num_blocks):
            block = PhysicalTokenBlock(device=device,
                                       block_number=i,
@@ -209,6 +222,9 @@ class UncachedBlockAllocator(BlockAllocatorBase):
        raise NotImplementedError(
            "Invalid codepath for uncached block allocator.")

+    def get_prefix_cache_hit_rate(self) -> float:
+        return -1
+

 class BlockSpaceManagerV1(BlockSpaceManager):
    """Manages the mapping between logical and physical token blocks."""
@@ -256,6 +272,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
                Device.CPU, block_size, num_cpu_blocks)
        # Mapping: seq_id -> BlockTable.
        self.block_tables: Dict[int, BlockTable] = {}
+
        # Mapping: req_id -> BlockTable
        # Note that each SequenceGroup has a unique
        # request ID
@@ -299,7 +316,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        # Allocate new physical token blocks that will store the prompt tokens.
        num_prompt_blocks = seq.n_blocks

-        block_table: BlockTable = []
+        block_table: BlockTable = BlockTable()
        for logical_idx in range(num_prompt_blocks):
            if (self.block_sliding_window is not None
                    and logical_idx >= self.block_sliding_window):
@@ -326,15 +343,19 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        #
        # NOTE: Here we assume that all sequences in the group have the same
        # decoder prompt.
-        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
+        wait_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
+        seq = wait_seqs[0]
        block_table: BlockTable = \
            self._allocate_sequence(seq,
                                    seq_group.num_seqs(),
                                    is_encoder_decoder)

        # Assign the self-attention block tables for each sequence.
-        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
-            self.block_tables[seq.seq_id] = block_table.copy()
+        if len(wait_seqs) == 1:
+            self.block_tables[seq.seq_id] = block_table
+        else:
+            for seq in wait_seqs:
+                self.block_tables[seq.seq_id] = block_table.copy()

        # Allocate encoder sequence
        if is_encoder_decoder:
@@ -476,6 +497,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
            return
        src_block_table = self.block_tables[parent_seq.seq_id]
        self.block_tables[child_seq.seq_id] = src_block_table.copy()
+
        # When using a sliding window, blocks will be eventually reused.
        # In this case the block tables will contain repeated blocks.
        # When forking, we must make sure that each block's `ref_count`
@@ -527,7 +549,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
            dest_allocator: BlockAllocatorBase,
            mapping: Dict[PhysicalTokenBlock,
                          PhysicalTokenBlock]) -> BlockTable:
-        new_block_table = []
+        new_block_table: BlockTable = BlockTable()

        for from_block in block_table:
            if from_block in mapping:
@@ -553,8 +575,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
            self.block_tables[seq.seq_id] = \
                self._swap_block_table(self.block_tables[seq.seq_id],
-                                       self.cpu_allocator,
-                                       self.gpu_allocator,
+                                       self.cpu_allocator, self.gpu_allocator,
                                       mapping)

        if seq_group.is_encoder_decoder():
@@ -580,8 +601,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            self.block_tables[seq.seq_id] = \
                self._swap_block_table(self.block_tables[seq.seq_id],
-                                       self.gpu_allocator,
-                                       self.cpu_allocator,
+                                       self.gpu_allocator, self.cpu_allocator,
                                       mapping)

        if seq_group.is_encoder_decoder():
@@ -636,8 +656,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        self.cross_block_tables.clear()

    def get_block_table(self, seq: Sequence) -> List[int]:
-        block_table = self.block_tables[seq.seq_id]
-        return [block.block_number for block in block_table]
+        return self.block_tables[seq.seq_id].ids()

    def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
        block_table = self.cross_block_tables[seq_group.request_id]
@@ -702,3 +721,10 @@ class BlockSpaceManagerV1(BlockSpaceManager):
        if self.enable_caching:
            for seq in seq_group.get_seqs():
                self.compute_full_blocks_in_seq(seq)
+
+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        if device == Device.GPU:
+            return self.gpu_allocator.get_prefix_cache_hit_rate()
+        if device == Device.CPU:
+            return self.cpu_allocator.get_prefix_cache_hit_rate()
+        raise ValueError(f"Invalid device: {device}")
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -441,6 +441,9 @@ class BlockSpaceManagerV2(BlockSpaceManager):
    def get_num_free_cpu_blocks(self) -> int:
        return self.block_allocator.get_num_free_blocks(Device.CPU)

+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        return self.block_allocator.get_prefix_cache_hit_rate(device)
+
    def _can_swap(self,
                  seq_group: SequenceGroup,
                  device: Device,

--- a/vllm/core/embedding_model_block_manager.py
+++ b/vllm/core/embedding_model_block_manager.py
@@ -2,6 +2,7 @@ from typing import List, Tuple

 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.sequence import Sequence, SequenceGroup
+from vllm.utils import Device


 class EmbeddingModelBlockSpaceManager(BlockSpaceManager):
@@ -81,3 +82,6 @@ class EmbeddingModelBlockSpaceManager(BlockSpaceManager):

    def mark_blocks_as_computed(self, seq_group: SequenceGroup):
        pass
+
+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        return -1
--- a/vllm/core/evictor_v2.py
+++ b/vllm/core/evictor_v2.py
@@ -85,18 +85,21 @@ class LRUEvictor(Evictor):
        if len(self.free_table) == 0:
            raise ValueError("No usable cache memory left")

-        evicted_block = next(iter(self.free_table.values()))
-        evicted_block_id = next(iter(self.free_table.keys()))
+        evicted_block, evicted_block_id = None, None
        # The blocks with the lowest timestamps should be placed consecutively
        # at the start of OrderedDict. Loop through all these blocks to
        # find the one with maximum number of hashed tokens.
        for _id, block in self.free_table.items():
-            if evicted_block.last_accessed > block.last_accessed or (
-                    evicted_block.last_accessed == block.last_accessed and
-                    evicted_block.num_hashed_tokens < block.num_hashed_tokens):
-                evicted_block = block
-                evicted_block_id = _id
-
+            if evicted_block is None:
+                evicted_block, evicted_block_id = block, _id
+                continue
+            if evicted_block.last_accessed < block.last_accessed:
+                break
+            if evicted_block.num_hashed_tokens < block.num_hashed_tokens:
+                evicted_block, evicted_block_id = block, _id
+
+        assert evicted_block is not None
+        assert evicted_block_id is not None
        self.free_table.pop(evicted_block_id)

        return evicted_block_id, evicted_block.content_hash

--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -5,6 +5,7 @@ from typing import Sequence as GenericSequence
 from typing import Tuple

 from vllm.sequence import Sequence, SequenceGroup
+from vllm.utils import Device


 class AllocStatus(enum.Enum):
@@ -116,3 +117,8 @@ class BlockSpaceManager(ABC):
    @abstractmethod
    def mark_blocks_as_computed(self, seq_group: SequenceGroup):
        pass
+
+    @abstractmethod
+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        """Prefix cache hit rate. -1 means not supported or disabled."""
+        pass
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -12,7 +12,9 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
-                           SequenceGroupMetadata, SequenceStatus)
+                           SequenceGroupMetadata, SequenceGroupMetadataDelta,
+                           SequenceStatus)
+from vllm.utils import Device, PyObjectCache

 logger = init_logger(__name__)

@@ -176,10 +178,10 @@ class SchedulerRunningOutputs:
    enough memory, it can be preempted (for recompute) or swapped out.
    """
    # Selected sequences that are running and in a decoding phase.
-    decode_seq_groups: List[SequenceGroup]
+    decode_seq_groups: List[ScheduledSequenceGroup]
    # Selected sequences that are running and in a prefill phase.
    # I.e., it means the prefill has been chunked.
-    prefill_seq_groups: List[SequenceGroup]
+    prefill_seq_groups: List[ScheduledSequenceGroup]
    # The preempted sequences.
    preempted: List[SequenceGroup]
    # Sequences that are swapped out.
@@ -191,6 +193,10 @@ class SchedulerRunningOutputs:
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int

+    # Optimization for fast-access to seq_group lists
+    decode_seq_groups_list: List[SequenceGroup]
+    prefill_seq_groups_list: List[SequenceGroup]
+
    @classmethod
    def create_empty(cls) -> "SchedulerRunningOutputs":
        return SchedulerRunningOutputs(
@@ -201,6 +207,8 @@ class SchedulerRunningOutputs:
            blocks_to_swap_out=[],
            blocks_to_copy=[],
            num_lookahead_slots=0,
+            decode_seq_groups_list=[],
+            prefill_seq_groups_list=[],
        )


@@ -259,6 +267,30 @@ class SchedulerPrefillOutputs:
        )


+def seq_group_metadata_builder():
+    return SequenceGroupMetadata(request_id="",
+                                 is_prompt=False,
+                                 seq_data={},
+                                 sampling_params=None,
+                                 block_tables={})
+
+
+def scheduler_running_outputs_builder():
+    return SchedulerRunningOutputs(decode_seq_groups=[],
+                                   prefill_seq_groups=[],
+                                   preempted=[],
+                                   swapped_out=[],
+                                   blocks_to_swap_out=[],
+                                   blocks_to_copy=[],
+                                   num_lookahead_slots=0,
+                                   prefill_seq_groups_list=[],
+                                   decode_seq_groups_list=[])
+
+
+def scheduled_seq_group_builder():
+    return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
+
+
 class Scheduler:

    def __init__(
@@ -331,6 +363,12 @@ class Scheduler:
                                       else 0)
        self.num_cumulative_preemption: int = 0

+        # Used to cache python objects
+        self._scheduler_running_outputs_cache: PyObjectCache = PyObjectCache(
+            scheduler_running_outputs_builder)
+        self._scheduled_seq_group_cache: PyObjectCache = PyObjectCache(
+            scheduled_seq_group_builder)
+
    @property
    def lora_enabled(self) -> bool:
        return bool(self.lora_config)
@@ -392,10 +430,26 @@ class Scheduler:
                    seq.status = SequenceStatus.FINISHED_ABORTED
                    self.free_seq(seq)

+                self._free_seq_group_cross_attn_blocks(aborted_group)
+
+    def _free_seq_group_cross_attn_blocks(
+        self,
+        seq_group: SequenceGroup,
+    ) -> None:
+        """
+        Free a sequence group from a cross-attention block table.
+        Has no effect on decoder-only models.
+        """
+        if seq_group.is_encoder_decoder():
+            self.block_manager.free_cross(seq_group)
+
    def has_unfinished_seqs(self) -> bool:
        return len(self.waiting) != 0 or len(self.running) != 0 or len(
            self.swapped) != 0

+    def get_prefix_cache_hit_rate(self, device: Device) -> float:
+        return self.block_manager.get_prefix_cache_hit_rate(device)
+
    def get_num_unfinished_seq_groups(self) -> int:
        return len(self.waiting) + len(self.running) + len(self.swapped)

@@ -428,14 +482,30 @@ class Scheduler:
        Returns:
            SchedulerRunningOutputs.
        """
+        ret: SchedulerRunningOutputs = \
+            self._scheduler_running_outputs_cache.get_object()
+        ret.blocks_to_swap_out.clear()
+        ret.blocks_to_copy.clear()
+        ret.decode_seq_groups.clear()
+        ret.prefill_seq_groups.clear()
+        ret.preempted.clear()
+        ret.swapped_out.clear()
+
+        ret.num_lookahead_slots = self._get_num_lookahead_slots(
+            is_prefill=False)
+
+        ret.decode_seq_groups_list.clear()
+        ret.prefill_seq_groups_list.clear()
+
        # Blocks that need to be swapped or copied before model execution.
-        blocks_to_swap_out: List[Tuple[int, int]] = []
-        blocks_to_copy: List[Tuple[int, int]] = []
+        blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
+        blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy

-        decode_seq_groups: List[ScheduledSequenceGroup] = []
-        prefill_seq_groups: List[ScheduledSequenceGroup] = []
-        preempted: List[SequenceGroup] = []
-        swapped_out: List[SequenceGroup] = []
+        decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
+        prefill_seq_groups: List[
+            ScheduledSequenceGroup] = ret.prefill_seq_groups
+        preempted: List[SequenceGroup] = ret.preempted
+        swapped_out: List[SequenceGroup] = ret.swapped_out

        # NOTE(woosuk): Preemption happens only when there is no available slot
        # to keep all the sequence groups in the RUNNING state.
@@ -484,15 +554,19 @@ class Scheduler:
            else:
                self._append_slots(seq_group, blocks_to_copy)
                is_prefill = seq_group.is_prefill()
+
+                scheduled_seq_group: ScheduledSequenceGroup = \
+                    self._scheduled_seq_group_cache.get_object()
+                scheduled_seq_group.seq_group = seq_group
                if is_prefill:
-                    prefill_seq_groups.append(
-                        ScheduledSequenceGroup(
-                            seq_group=seq_group,
-                            token_chunk_size=num_running_tokens))
+                    scheduled_seq_group.token_chunk_size = num_running_tokens
+                    prefill_seq_groups.append(scheduled_seq_group)
+                    ret.prefill_seq_groups_list.append(seq_group)
                else:
-                    decode_seq_groups.append(
-                        ScheduledSequenceGroup(seq_group=seq_group,
-                                               token_chunk_size=1))
+                    scheduled_seq_group.token_chunk_size = 1
+                    decode_seq_groups.append(scheduled_seq_group)
+                    ret.decode_seq_groups_list.append(seq_group)
+
                budget.add_num_batched_tokens(seq_group.request_id,
                                              num_running_tokens)
                # OPTIMIZATION:  Note that get_max_num_running_seqs is
@@ -505,15 +579,10 @@ class Scheduler:
                if curr_loras is not None and seq_group.lora_int_id > 0:
                    curr_loras.add(seq_group.lora_int_id)

-        return SchedulerRunningOutputs(
-            decode_seq_groups=decode_seq_groups,
-            prefill_seq_groups=prefill_seq_groups,
-            preempted=preempted,
-            swapped_out=swapped_out,
-            blocks_to_swap_out=blocks_to_swap_out,
-            blocks_to_copy=blocks_to_copy,
-            num_lookahead_slots=self._get_num_lookahead_slots(
-                is_prefill=False))
+        self._scheduler_running_outputs_cache.reset()
+        self._scheduled_seq_group_cache.reset()
+
+        return ret

    def _schedule_swapped(
        self,
@@ -665,7 +734,7 @@ class Scheduler:
                all tokens.

        Returns:
-            SchedulerSwappedInOutputs.
+            SchedulerPrefillOutputs.
        """
        ignored_seq_groups: List[SequenceGroup] = []
        seq_groups: List[SequenceGroup] = []
@@ -738,6 +807,9 @@ class Scheduler:
                curr_loras.add(lora_int_id)
            waiting_queue.popleft()
            self._allocate_and_set_running(seq_group)
+            seq_group.init_multi_step(
+                num_scheduler_steps=self._get_num_lookahead_slots(
+                    is_prefill=True) + 1)
            seq_groups.append(
                ScheduledSequenceGroup(seq_group=seq_group,
                                       token_chunk_size=num_new_tokens))
@@ -807,11 +879,15 @@ class Scheduler:
        # Update waiting requests.
        self.waiting.extendleft(running_scheduled.preempted)
        # Update new running requests.
-        self.running.extend([s.seq_group for s in prefills.seq_groups])
-        self.running.extend(
-            [s.seq_group for s in running_scheduled.decode_seq_groups])
-        self.running.extend(
-            [s.seq_group for s in swapped_in.decode_seq_groups])
+        if len(prefills.seq_groups) > 0:
+            self.running.extend([s.seq_group for s in prefills.seq_groups])
+
+        self.running.extend(running_scheduled.decode_seq_groups_list)
+
+        if len(swapped_in.decode_seq_groups) > 0:
+            self.running.extend(
+                [s.seq_group for s in swapped_in.decode_seq_groups])
+
        # Update swapped requests.
        self.swapped.extend(running_scheduled.swapped_out)
        preempted = (len(running_scheduled.preempted) +
@@ -821,24 +897,36 @@ class Scheduler:
        # doesn't allow chunked prefills.
        assert len(running_scheduled.prefill_seq_groups) == 0
        assert len(swapped_in.prefill_seq_groups) == 0
+
+        # Merge lists
+        num_prefill_groups = len(prefills.seq_groups)
+        if num_prefill_groups > 0:
+            scheduled_seq_groups = prefills.seq_groups
+            scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
+        else:
+            scheduled_seq_groups = running_scheduled.decode_seq_groups
+        scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
+
+        blocks_to_copy = running_scheduled.blocks_to_copy
+        blocks_to_copy.extend(swapped_in.blocks_to_copy)
+
+        ignored_seq_groups = prefills.ignored_seq_groups
+        ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
+
        return SchedulerOutputs(
-            scheduled_seq_groups=(prefills.seq_groups +
-                                  running_scheduled.decode_seq_groups +
-                                  swapped_in.decode_seq_groups),
-            num_prefill_groups=len(prefills.seq_groups),
+            scheduled_seq_groups=scheduled_seq_groups,
+            num_prefill_groups=num_prefill_groups,
            num_batched_tokens=budget.num_batched_tokens,
            blocks_to_swap_in=swapped_in.blocks_to_swap_in,
            blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
-            blocks_to_copy=running_scheduled.blocks_to_copy +
-            swapped_in.blocks_to_copy,
-            ignored_seq_groups=prefills.ignored_seq_groups +
-            swapped_in.infeasible_seq_groups,
+            blocks_to_copy=blocks_to_copy,
+            ignored_seq_groups=ignored_seq_groups,
            num_lookahead_slots=running_scheduled.num_lookahead_slots,
            running_queue_size=len(self.running),
            preempted=preempted,
        )

-    def _schedule_chunked_prefill(self):
+    def _schedule_chunked_prefill(self) -> SchedulerOutputs:
        """Schedule queued requests.
        
        Chunked prefill allows to chunk prefill requests, batch them together
@@ -849,7 +937,7 @@ class Scheduler:

        The policy can sustain the high GPU utilization because it can put
        prefill and decodes requests to the same batch, while it improves
-        inter token latency because decodes requests don't need to blocked
+        inter token latency because decodes requests don't need to be blocked
        by prefill requests.
        """
        budget = SchedulingBudget(
@@ -947,9 +1035,13 @@ class Scheduler:
        # Schedule sequence groups.
        # This function call changes the internal states of the scheduler
        # such as self.running, self.swapped, and self.waiting.
+        scheduler_start_time = time.perf_counter()
        scheduler_outputs = self._schedule()
        now = time.time()

+        if not self.cache_config.enable_prefix_caching:
+            common_computed_block_nums = []
+
        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for i, scheduled_seq_group in enumerate(
@@ -963,52 +1055,88 @@ class Scheduler:
            # seq_id -> physical block numbers
            block_tables: Dict[int, List[int]] = {}

+            if seq_group.is_encoder_decoder():
+                # Encoder associated with SequenceGroup
+                encoder_seq_data = seq_group.get_encoder_seq().data
+                # Block table for cross-attention
+                # Also managed at SequenceGroup level
+                cross_block_table = self.block_manager.get_cross_block_table(
+                    seq_group)
+            else:
+                encoder_seq_data = None
+                cross_block_table = None
+
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
                seq_data[seq_id] = seq.data
                block_tables[seq_id] = self.block_manager.get_block_table(seq)
                self.block_manager.access_all_blocks_in_seq(seq, now)

-            common_computed_block_nums = (
-                self.block_manager.get_common_computed_block_ids(
-                    seq_group.get_seqs(status=SequenceStatus.RUNNING)))
+            if self.cache_config.enable_prefix_caching:
+                common_computed_block_nums = (
+                    self.block_manager.get_common_computed_block_ids(
+                        seq_group.get_seqs(status=SequenceStatus.RUNNING)))

            do_sample = True
-            if seq_group.is_prefill():
+            is_prompt = seq_group.is_prefill()
+            # We should send the metadata to workers when the first prefill
+            # is sent. Subsequent requests could be chunked prefill or decode.
+            is_first_prefill = False
+            if is_prompt:
                seqs = seq_group.get_seqs()
                # Prefill has only 1 sequence.
                assert len(seqs) == 1
+                num_computed_tokens = seqs[0].data.get_num_computed_tokens()
+                is_first_prefill = num_computed_tokens == 0
                # In the next iteration, all prompt tokens are not computed.
                # It means the prefill is chunked, and we don't need sampling.
                # NOTE: We use get_len instead of get_prompt_len because when
                # a sequence is preempted, prefill includes previous generated
                # output tokens.
-                if (token_chunk_size + seqs[0].data.get_num_computed_tokens() <
+                if (token_chunk_size + num_computed_tokens <
                        seqs[0].data.get_len()):
                    do_sample = False

            # It assumes the scheduled_seq_groups is ordered by
            # prefill < decoding.
-            is_prompt = seq_group.is_prefill()
-            seq_group_metadata = SequenceGroupMetadata(
-                request_id=seq_group.request_id,
-                is_prompt=is_prompt,
-                seq_data=seq_data,
-                sampling_params=seq_group.sampling_params,
-                block_tables=block_tables,
-                do_sample=do_sample,
-                pooling_params=seq_group.pooling_params,
-                token_chunk_size=token_chunk_size,
-                lora_request=seq_group.lora_request,
-                computed_block_nums=common_computed_block_nums,
-                # `multi_modal_data` will only be present for the 1st comm
-                # between engine and worker.
-                # the subsequent comms can still use delta, but
-                # `multi_modal_data` will be None.
-                multi_modal_data=seq_group.multi_modal_data
-                if scheduler_outputs.num_prefill_groups > 0 else None,
-                prompt_adapter_request=seq_group.prompt_adapter_request,
-            )
+            if is_first_prefill or not self.scheduler_config.send_delta_data:
+                seq_group_metadata = SequenceGroupMetadata(
+                    request_id=seq_group.request_id,
+                    is_prompt=is_prompt,
+                    seq_data=seq_data,
+                    sampling_params=seq_group.sampling_params,
+                    block_tables=block_tables,
+                    do_sample=do_sample,
+                    pooling_params=seq_group.pooling_params,
+                    token_chunk_size=token_chunk_size,
+                    lora_request=seq_group.lora_request,
+                    computed_block_nums=common_computed_block_nums,
+                    encoder_seq_data=encoder_seq_data,
+                    cross_block_table=cross_block_table,
+                    state=seq_group.state,
+                    # `multi_modal_data` will only be present for the 1st comm
+                    # between engine and worker.
+                    # the subsequent comms can still use delta, but
+                    # `multi_modal_data` will be None.
+                    multi_modal_data=seq_group.multi_modal_data
+                    if scheduler_outputs.num_prefill_groups > 0 else None,
+                    prompt_adapter_request=seq_group.prompt_adapter_request,
+                )
+            else:
+                # When SPMD mode is enabled, we only send delta data except for
+                # the first request to reduce serialization cost.
+                seq_data_delta = {}
+                for id, data in seq_data.items():
+                    seq_data_delta[id] = data.get_delta_and_reset()
+                seq_group_metadata = SequenceGroupMetadataDelta(
+                    seq_data_delta,
+                    seq_group.request_id,
+                    block_tables,
+                    is_prompt,
+                    do_sample=do_sample,
+                    token_chunk_size=token_chunk_size,
+                    computed_block_nums=common_computed_block_nums,
+                )
            seq_group_metadata_list.append(seq_group_metadata)

        # Now that the batch has been created, we can assume all blocks in the
@@ -1019,6 +1147,17 @@ class Scheduler:
            self.block_manager.mark_blocks_as_computed(
                scheduled_seq_group.seq_group)

+        scheduler_time = time.perf_counter() - scheduler_start_time
+        # Add this to scheduler time to all the sequences that are currently
+        # running. This will help estimate if the scheduler is a significant
+        # component in the e2e latency.
+        for seq_group in self.running:
+            if seq_group is not None and seq_group.metrics is not None:
+                if seq_group.metrics.scheduler_time is not None:
+                    seq_group.metrics.scheduler_time += scheduler_time
+                else:
+                    seq_group.metrics.scheduler_time = scheduler_time
+
        return seq_group_metadata_list, scheduler_outputs

    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
@@ -1032,6 +1171,8 @@ class Scheduler:
        remaining: Deque[SequenceGroup] = deque()
        for seq_group in self.running:
            if seq_group.is_finished():
+                # Free cross-attention block table, if it exists
+                self._free_seq_group_cross_attn_blocks(seq_group)
                # Add the finished requests to the finished requests list.
                # This list will be used to update the Mamba cache in the
                # next step.
@@ -1062,10 +1203,12 @@ class Scheduler:
                slots.
        """
        num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False)
+        seq_group.init_multi_step(num_scheduler_steps=num_lookahead_slots + 1)

        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            cows = self.block_manager.append_slots(seq, num_lookahead_slots)
-            blocks_to_copy.extend(cows)
+            if len(cows) > 0:
+                blocks_to_copy.extend(cows)

    def _preempt(
        self,

--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -19,7 +19,7 @@ def tensor_model_parallel_all_gather(input_: torch.Tensor,

 def tensor_model_parallel_gather(input_: torch.Tensor,
                                 dst: int = 0,
-                                 dim: int = -1) -> torch.Tensor:
+                                 dim: int = -1) -> Optional[torch.Tensor]:
    """Gather the input tensor across model parallel group."""
    return get_tp_group().gather(input_, dst, dim)


--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -49,8 +49,13 @@ def find_loaded_library(lib_name) -> Optional[str]:
    if not found:
        # the library is not loaded in the current process
        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
    start = line.index("/")
    path = line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), \
+        f"Unexpected filename: {filename} for library {lib_name}"
    return path


@@ -98,9 +103,9 @@ class CudaRTLibrary:

    def __init__(self, so_file: Optional[str] = None):
        if so_file is None:
-            so_file = find_loaded_library("libcudart.so")
+            so_file = find_loaded_library("libcudart")
            assert so_file is not None, \
-                "libcudart.so is not loaded in the current process"
+                "libcudart is not loaded in the current process"
        if so_file not in CudaRTLibrary.path_to_library_cache:
            lib = ctypes.CDLL(so_file)
            CudaRTLibrary.path_to_library_cache[so_file] = lib

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -11,15 +11,17 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
    gpu_p2p_access_check)
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless, is_full_nvlink
+from vllm.platforms import current_platform
+from vllm.utils import cuda_device_count_stateless
 from vllm.utils import is_hip

 try:
    if (not is_hip()):
-        assert ops.is_custom_op_supported("_C_custom_ar::meta_size")
+        ops.meta_size()
        custom_ar = True
    else:
        custom_ar = False
+
 except Exception:
    # For AMD GPUs and CPUs
    custom_ar = False
@@ -117,7 +119,10 @@ class CustomAllreduce:
        # test nvlink first, this will filter out most of the cases
        # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
-        full_nvlink = is_full_nvlink(physical_device_ids)
+        assert current_platform.is_cuda()
+        from vllm.platforms.cuda import CudaPlatform
+        cuda_platform: CudaPlatform = current_platform
+        full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
        if world_size > 2 and not full_nvlink:
            logger.warning(
                "Custom allreduce is disabled because it's not supported on"

--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -5,6 +5,7 @@ from torch.distributed import ProcessGroup
 from vllm.platforms import current_platform

 if current_platform.is_tpu():
+    import ray
    import torch_xla.core.xla_model as xm
    import torch_xla.runtime as xr
    from torch_xla._internal import pjrt
@@ -18,9 +19,15 @@ class TpuCommunicator:
            return
        self.disabled = False

-        local_rank = dist.get_rank(group)
-        world_size = dist.get_world_size(group)
-        pjrt.initialize_multiprocess(local_rank, world_size)
+        # NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
+        # must be used together. Therefore, the local rank and world size can
+        # be simply calculated as follows.
+        global_rank = dist.get_rank(group)
+        global_world_size = dist.get_world_size(group)
+        num_nodes = len(ray.nodes())
+        local_world_size = global_world_size // num_nodes
+        local_rank = global_rank % local_world_size
+        pjrt.initialize_multiprocess(local_rank, local_world_size)
        xr._init_world_size_ordinal()

    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -329,7 +329,7 @@ class GroupCoordinator:
    def gather(self,
               input_: torch.Tensor,
               dst: int = 0,
-               dim: int = -1) -> torch.Tensor:
+               dim: int = -1) -> Optional[torch.Tensor]:
        """
        NOTE: We assume that the input tensor is on the same device across
        all the ranks.

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -2,11 +2,15 @@ import argparse
 import dataclasses
 import json
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
+                    Union)

+import torch
+
+import vllm.envs as envs
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
-                         EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
-                         MultiModalConfig, ObservabilityConfig, ParallelConfig,
+                         EngineConfig, LoadConfig, LoadFormat, LoRAConfig,
+                         ModelConfig, ObservabilityConfig, ParallelConfig,
                         PromptAdapterConfig, SchedulerConfig,
                         SpeculativeConfig, TokenizerPoolConfig)
 from vllm.executor.executor_base import ExecutorBase
@@ -15,11 +19,12 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

 if TYPE_CHECKING:
-    from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
-        BaseTokenizerGroup)
+    from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup

 logger = init_logger(__name__)

+ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"]
+

 def nullable_str(val: str):
    if not val or val == "None":
@@ -27,11 +32,32 @@ def nullable_str(val: str):
    return val


+def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
+    if len(val) == 0:
+        return None
+
+    out_dict: Dict[str, int] = {}
+    for item in val.split(","):
+        try:
+            key, value = item.split("=")
+        except TypeError as exc:
+            msg = "Each item should be in the form KEY=VALUE"
+            raise ValueError(msg) from exc
+
+        try:
+            out_dict[key] = int(value)
+        except ValueError as exc:
+            msg = f"Failed to parse value of item {key}={value}"
+            raise ValueError(msg) from exc
+
+    return out_dict
+
+
 @dataclass
 class EngineArgs:
    """Arguments for vLLM engine."""
-    model: str
-    served_model_name: Optional[Union[List[str]]] = None
+    model: str = 'facebook/opt-125m'
+    served_model_name: Optional[Union[str, List[str]]] = None
    tokenizer: Optional[str] = None
    skip_tokenizer_init: bool = False
    tokenizer_mode: str = 'auto'
@@ -56,8 +82,8 @@ class EngineArgs:
    enable_prefix_caching: bool = False
    disable_sliding_window: bool = False
    use_v2_block_manager: bool = False
-    swap_space: int = 4  # GiB
-    cpu_offload_gb: int = 0  # GiB
+    swap_space: float = 4  # GiB
+    cpu_offload_gb: float = 0  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
@@ -69,7 +95,7 @@ class EngineArgs:
    rope_theta: Optional[float] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
-    enforce_eager: bool = False
+    enforce_eager: Optional[bool] = None
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
@@ -79,6 +105,7 @@ class EngineArgs:
    # notice.
    tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
+    limit_mm_per_prompt: Optional[Mapping[str, int]] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
@@ -88,9 +115,10 @@ class EngineArgs:
    fully_sharded_loras: bool = False
    lora_extra_vocab_size: int = 256
    long_lora_scaling_factors: Optional[Tuple[float]] = None
-    lora_dtype: str = 'auto'
+    lora_dtype: Optional[Union[str, torch.dtype]] = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
+    num_scheduler_steps: int = 1
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
@@ -104,6 +132,7 @@ class EngineArgs:
    guided_decoding_backend: str = 'outlines'
    # Speculative decoding configuration.
    speculative_model: Optional[str] = None
+    speculative_model_quantization: Optional[str] = None
    speculative_draft_tensor_parallel_size: Optional[int] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
@@ -117,6 +146,7 @@ class EngineArgs:
    disable_logprobs_during_spec_decoding: Optional[bool] = None

    otlp_traces_endpoint: Optional[str] = None
+    collect_detailed_traces: Optional[str] = None

    def __post_init__(self):
        if self.tokenizer is None:
@@ -130,7 +160,7 @@ class EngineArgs:
        parser.add_argument(
            '--model',
            type=str,
-            default='facebook/opt-125m',
+            default=EngineArgs.model,
            help='Name or path of the huggingface model to use.')
        parser.add_argument(
            '--tokenizer',
@@ -184,10 +214,7 @@ class EngineArgs:
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
-            choices=[
-                'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-                'bitsandbytes'
-            ],
+            choices=[f.value for f in LoadFormat],
            help='The format of the model weights to load.\n\n'
            '* "auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
@@ -290,7 +317,7 @@ class EngineArgs:
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
                            help='Token block size for contiguous chunks of '
                            'tokens.')

@@ -318,7 +345,7 @@ class EngineArgs:
                            default=EngineArgs.seed,
                            help='Random seed for operations.')
        parser.add_argument('--swap-space',
-                            type=int,
+                            type=float,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU.')
        parser.add_argument(
@@ -432,6 +459,21 @@ class EngineArgs:
                            'This should be a JSON string that will be '
                            'parsed into a dictionary. Ignored if '
                            'tokenizer_pool_size is 0.')
+
+        # Multimodal related configs
+        parser.add_argument(
+            '--limit-mm-per-prompt',
+            type=nullable_kvs,
+            default=EngineArgs.limit_mm_per_prompt,
+            # The default value is given in
+            # MultiModalRegistry.init_mm_limits_per_prompt
+            help=('For each multimodal plugin, limit how many '
+                  'input instances to allow for each prompt. '
+                  'Expects a comma-separated list of items, '
+                  'e.g.: `image=16,video=2` allows a maximum of 16 '
+                  'images and 2 videos per prompt. Defaults to 1 for '
+                  'each modality.'))
+
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
@@ -503,6 +545,11 @@ class EngineArgs:
                                "tpu", "xpu"
                            ],
                            help='Device type for vLLM execution.')
+        parser.add_argument('--num-scheduler-steps',
+                            type=int,
+                            default=1,
+                            help=('Maximum number of forward steps per '
+                                  'scheduler call.'))

        parser.add_argument(
            '--scheduler-delay-factor',
@@ -525,6 +572,18 @@ class EngineArgs:
            default=EngineArgs.speculative_model,
            help=
            'The name of the draft model to be used in speculative decoding.')
+        # Quantization settings for speculative model.
+        parser.add_argument(
+            '--speculative-model-quantization',
+            type=nullable_str,
+            choices=[*QUANTIZATION_METHODS, None],
+            default=EngineArgs.speculative_model_quantization,
+            help='Method used to quantize the weights of speculative model.'
+            'If None, we first check the `quantization_config` '
+            'attribute in the model config file. If that is '
+            'None, we assume the model weights are not '
+            'quantized and use `dtype` to determine the data '
+            'type of the weights.')
        parser.add_argument(
            '--num-speculative-tokens',
            type=int,
@@ -602,8 +661,10 @@ class EngineArgs:

        parser.add_argument(
            '--disable-logprobs-during-spec-decoding',
-            type=bool,
+            action=StoreBoolean,
            default=EngineArgs.disable_logprobs_during_spec_decoding,
+            nargs="?",
+            const="True",
            help='If set to True, token log probabilities are not returned '
            'during speculative decoding. If set to False, log probabilities '
            'are returned according to the settings in SamplingParams. If '
@@ -660,6 +721,16 @@ class EngineArgs:
            type=str,
            default=None,
            help='Target URL to which OpenTelemetry traces will be sent.')
+        parser.add_argument(
+            '--collect-detailed-traces',
+            type=str,
+            default=None,
+            help="Valid choices are " +
+            ",".join(ALLOWED_DETAILED_TRACE_MODULES) +
+            ". It makes sense to set this only if --otlp-traces-endpoint is"
+            " set. If set, it will collect detailed traces for the specified "
+            "modules. This involves use of possibly costly and or blocking "
+            "operations and hence might have a performance impact.")

        return parser

@@ -672,6 +743,9 @@ class EngineArgs:
        return engine_args

    def create_engine_config(self, ) -> EngineConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if self.model.endswith(".gguf"):
+            self.quantization = self.load_format = "gguf"

        # bitsandbytes quantization needs a specific model loader
        # so we make sure the quant method and the load format are consistent
@@ -693,8 +767,6 @@ class EngineArgs:
            "CPU offload space must be non-negative"
            f", but got {self.cpu_offload_gb}")

-        multimodal_config = MultiModalConfig()
-
        device_config = DeviceConfig(device=self.device)
        model_config = ModelConfig(
            model=self.model,
@@ -718,7 +790,8 @@ class EngineArgs:
            disable_sliding_window=self.disable_sliding_window,
            skip_tokenizer_init=self.skip_tokenizer_init,
            served_model_name=self.served_model_name,
-            multimodal_config=multimodal_config)
+            limit_mm_per_prompt=self.limit_mm_per_prompt,
+        )
        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
@@ -779,11 +852,19 @@ class EngineArgs:
                "in low performance due to small KV cache space. Consider "
                "setting --max-model-len to a smaller value.", max_model_len)

+        if self.num_scheduler_steps > 1 and not self.use_v2_block_manager:
+            self.use_v2_block_manager = True
+            logger.warning(
+                "Enabled BlockSpaceManagerV2 because it is "
+                "required for multi-step (--num-scheduler-steps > 1)")
+
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
+            speculative_model_quantization = \
+                self.speculative_model_quantization,
            speculative_draft_tensor_parallel_size = \
                self.speculative_draft_tensor_parallel_size,
            num_speculative_tokens=self.num_speculative_tokens,
@@ -804,18 +885,35 @@ class EngineArgs:
            disable_logprobs=self.disable_logprobs_during_spec_decoding,
        )

+        if self.num_scheduler_steps > 1:
+            if speculative_config is not None:
+                raise ValueError("Speculative decoding is not supported with "
+                                 "multi-step (--num-scheduler-steps > 1)")
+            if self.enable_chunked_prefill:
+                raise ValueError("Chunked prefill is not supported with "
+                                 "multi-step (--num-scheduler-steps > 1)")
+
+        # make sure num_lookahead_slots is set the higher value depending on
+        # if we are using speculative decoding or multi-step
+        num_lookahead_slots = max(self.num_lookahead_slots,
+                                  self.num_scheduler_steps - 1)
+        num_lookahead_slots = num_lookahead_slots \
+            if speculative_config is None \
+            else speculative_config.num_lookahead_slots
+
        scheduler_config = SchedulerConfig(
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            use_v2_block_manager=self.use_v2_block_manager,
-            num_lookahead_slots=(self.num_lookahead_slots
-                                 if speculative_config is None else
-                                 speculative_config.num_lookahead_slots),
+            num_lookahead_slots=num_lookahead_slots,
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
            embedding_mode=model_config.embedding_mode,
            preemption_mode=self.preemption_mode,
+            num_scheduler_steps=self.num_scheduler_steps,
+            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
+                             and parallel_config.use_ray),
        )
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
@@ -849,8 +947,21 @@ class EngineArgs:
        decoding_config = DecodingConfig(
            guided_decoding_backend=self.guided_decoding_backend)

+        detailed_trace_modules = []
+        if self.collect_detailed_traces is not None:
+            detailed_trace_modules = self.collect_detailed_traces.split(",")
+        for m in detailed_trace_modules:
+            if m not in ALLOWED_DETAILED_TRACE_MODULES:
+                raise ValueError(
+                    f"Invalid module {m} in collect_detailed_traces. "
+                    f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}")
        observability_config = ObservabilityConfig(
-            otlp_traces_endpoint=self.otlp_traces_endpoint)
+            otlp_traces_endpoint=self.otlp_traces_endpoint,
+            collect_model_forward_time="model" in detailed_trace_modules
+            or "all" in detailed_trace_modules,
+            collect_model_execute_time="worker" in detailed_trace_modules
+            or "all" in detailed_trace_modules,
+        )

        if (model_config.get_sliding_window() is not None
                and scheduler_config.chunked_prefill_enabled
@@ -866,7 +977,6 @@ class EngineArgs:
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
-            multimodal_config=multimodal_config,
            speculative_config=speculative_config,
            load_config=load_config,
            decoding_config=decoding_config,
@@ -889,7 +999,13 @@ class AsyncEngineArgs(EngineArgs):
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.')
+                            'separate process as the server process.'
+                            '(DEPRECATED. This argument is deprecated '
+                            'and will be removed in a future update. '
+                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
+                            'use it. See '
+                            'https://github.com/vllm-project/vllm/issues/7045.'
+                            ')')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='Disable logging requests.')