Commit af7f4372 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

parents 5e19cdef 09c77926
import enum
import json
from dataclasses import dataclass, field, fields
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Type, Union
from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple,
Type, Union)
import torch
from transformers import PretrainedConfig
......@@ -10,10 +11,14 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
from vllm.tracing import is_otel_installed
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
from vllm.platforms import current_platform
from vllm.tracing import is_otel_available, otel_import_error_traceback
from vllm.transformers_utils.config import (get_config,
get_hf_image_processor_config,
get_hf_text_config)
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_CUDAGRAPH, GiB_bytes,
cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_openvino, is_xpu,
print_warning_once)
if TYPE_CHECKING:
......@@ -26,7 +31,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
_GB = 1 << 30
_EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
_PP_SUPPORTED_MODELS = [
......@@ -34,6 +38,7 @@ _PP_SUPPORTED_MODELS = [
"AquilaForCausalLM",
"DeepseekV2ForCausalLM",
"InternLMForCausalLM",
"JAISLMHeadModel",
"LlamaForCausalLM",
"LLaMAForCausalLM",
"MistralForCausalLM",
......@@ -87,6 +92,9 @@ class ModelConfig:
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
If None, the user did not specify, so default to False -
except for encoder/decoder models, which currently require
eager mode.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
......@@ -103,6 +111,8 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
"""
def __init__(
......@@ -119,16 +129,17 @@ class ModelConfig:
rope_theta: Optional[float] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None,
spec_target_max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
quantization_param_path: Optional[str] = None,
enforce_eager: bool = False,
enforce_eager: Optional[bool] = None,
max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: Optional[int] = None,
max_logprobs: int = 20,
disable_sliding_window: bool = False,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
multimodal_config: Optional["MultiModalConfig"] = None,
limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
) -> None:
self.model = model
self.tokenizer = tokenizer
......@@ -158,8 +169,38 @@ class ModelConfig:
self.hf_config = get_config(self.model, trust_remote_code, revision,
code_revision, rope_scaling, rope_theta)
self.hf_text_config = get_hf_text_config(self.hf_config)
self.hf_image_processor_config = get_hf_image_processor_config(
self.model, revision)
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
# Choose a default enforce_eager value if the user did not specify
# a value (enforce_eager is None)
if getattr(self.hf_config, 'is_encoder_decoder', False):
if self.enforce_eager is None:
# *Only for encoder/decoder models* and
# *only if enforce_eager is unset*, override
# to enforce_eager=True
#
# Add a logger message since it is *somewhat* non-intuitive that
# enforce_eager is True when the user has not specified its
# value.
logger.info("Forcing enforce_eager == True because "
"enforce_eager setting was unspecified and "
"CUDAGraph is not supported with encoder/ "
"decoder models.")
self.enforce_eager = True
if not self.enforce_eager:
# Eager mode explicitly disabled by user for an encoder/
# decoder model; however CUDAGRAPH + encoder/decoder is
# not currently supported
raise ValueError(STR_NOT_IMPL_ENC_DEC_CUDAGRAPH)
elif self.enforce_eager is None:
# *Only for decoder-only models*, enforce_eager
# defaults to False if unset. This is intuitive
# so no logging message needed.
self.enforce_eager = False
if (not self.disable_sliding_window
and self.hf_text_config.model_type == "gemma2"
and self.hf_text_config.sliding_window is not None):
......@@ -174,17 +215,33 @@ class ModelConfig:
hf_config=self.hf_text_config,
max_model_len=max_model_len,
disable_sliding_window=self.disable_sliding_window,
sliding_window_len=self.get_hf_config_sliding_window())
sliding_window_len=self.get_hf_config_sliding_window(),
spec_target_max_model_len=spec_target_max_model_len)
self.served_model_name = get_served_model_name(model,
served_model_name)
self.multimodal_config = multimodal_config
self.multimodal_config = self._init_multimodal_config(
limit_mm_per_prompt)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()
def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]:
architectures = getattr(self.hf_config, "architectures", [])
if any(
ModelRegistry.is_multimodal_model(arch)
for arch in architectures):
return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
else:
if limit_mm_per_prompt:
raise ValueError(
"limit_mm_per_prompt is only supported for multimodal "
"models.")
return None
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
if tokenizer_mode not in ["auto", "slow"]:
......@@ -207,11 +264,13 @@ class ModelConfig:
def _verify_quantization(self) -> None:
supported_quantization = [*QUANTIZATION_METHODS]
rocm_supported_quantization = ["gptq", "squeezellm","awq"]
rocm_supported_quantization = ["gptq", "squeezellm","awq"] # "fp8"
optimized_quantization_methods = [
"fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
"fbgemm_fp8", "compressed_tensors", "compressed-tensors"
"fbgemm_fp8", "compressed_tensors", "compressed-tensors",
"experts_int8"
]
tpu_supported_quantization = ["tpu_int8"]
if self.quantization is not None:
self.quantization = self.quantization.lower()
......@@ -250,6 +309,11 @@ class ModelConfig:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in ROCm.")
if current_platform.is_tpu(
) and self.quantization not in tpu_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in TPU Backend.")
if self.quantization not in optimized_quantization_methods:
logger.warning(
"%s quantization is not fully "
......@@ -290,8 +354,9 @@ class ModelConfig:
"BitAndBytes quantization with TP or PP is not supported yet.")
if self.quantization == "bitsandbytes" and self.enforce_eager is False:
raise ValueError(
"BitAndBytes with enforce_eager = False is not supported yet.")
logger.warning("CUDA graph is not supported on BitAndBytes yet, "
"fallback to the eager mode.")
self.enforce_eager = True
def get_hf_config_sliding_window(self) -> Optional[int]:
"""Get the sliding window size, or None if disabled."""
......@@ -425,6 +490,28 @@ class ModelConfig:
if t != "attention"
])
def get_multimodal_config(self) -> "MultiModalConfig":
    """Return this model's multimodal configuration.

    Raises:
        ValueError: If no multimodal configuration is set, i.e. the
            model is not multimodal.
    """
    mm_config = self.multimodal_config
    if mm_config is not None:
        return mm_config
    raise ValueError("The model is not multimodal.")
@property
def is_encoder_decoder_model(self) -> bool:
    """Report the HF config's ``is_encoder_decoder`` flag.

    Falls back to ``False`` when the HF config does not define it.
    """
    hf_config = self.hf_config
    return getattr(hf_config, "is_encoder_decoder", False)
@property
def is_embedding_model(self) -> bool:
    """Report the embedding flag stored in ``self.embedding_mode``."""
    embedding_mode = self.embedding_mode
    return embedding_mode
class CacheConfig:
"""Configuration for the KV cache.
......@@ -443,7 +530,7 @@ class CacheConfig:
self,
block_size: int,
gpu_memory_utilization: float,
swap_space: int,
swap_space: float,
cache_dtype: str,
num_gpu_blocks_override: Optional[int] = None,
sliding_window: Optional[int] = None,
......@@ -452,7 +539,7 @@ class CacheConfig:
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GB
self.swap_space_bytes = swap_space * GiB_bytes
self.num_gpu_blocks_override = num_gpu_blocks_override
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
......@@ -497,10 +584,6 @@ class CacheConfig:
raise NotImplementedError(
"Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching.")
if self.cache_dtype == "fp8":
raise NotImplementedError(
"Prefix caching is not supported for fp8 cache_dtype. "
"Run with --kv-cache-dtype auto to use prefix caching.")
def verify_with_parallel_config(
self,
......@@ -512,9 +595,9 @@ class CacheConfig:
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
"allocated for the swap space.")
msg = (f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
f"{total_cpu_memory / GiB_bytes:.2f} GiB total CPU memory "
"is allocated for the swap space.")
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
......@@ -582,6 +665,7 @@ class LoadFormat(str, enum.Enum):
DUMMY = "dummy"
TENSORIZER = "tensorizer"
SHARDED_STATE = "sharded_state"
GGUF = "gguf"
BITSANDBYTES = "bitsandbytes"
......@@ -692,8 +776,8 @@ class ParallelConfig:
self.tokenizer_pool_config = tokenizer_pool_config
self.ray_workers_use_nsight = ray_workers_use_nsight
self.placement_group = placement_group
self.world_size = pipeline_parallel_size * self.tensor_parallel_size
if worker_use_ray:
if self.distributed_executor_backend is None:
self.distributed_executor_backend = "ray"
......@@ -789,6 +873,11 @@ class SchedulerConfig:
swapping. However, when the sequence group has multiple sequences
(e.g., beam search), recomputation is not currently supported. In
such a case, we use swapping instead.
send_delta_data: Private API. If used, scheduler sends delta data to
workers instead of an entire data. It should be enabled only
when SPMD worker architecture is enabled. I.e.,
VLLM_USE_RAY_SPMD_WORKER=1
"""
def __init__(self,
......@@ -800,7 +889,9 @@ class SchedulerConfig:
delay_factor: float = 0.0,
enable_chunked_prefill: bool = False,
embedding_mode: Optional[bool] = False,
preemption_mode: Optional[str] = None) -> None:
preemption_mode: Optional[str] = None,
num_scheduler_steps: int = 1,
send_delta_data: bool = False) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
else:
......@@ -829,6 +920,8 @@ class SchedulerConfig:
self.chunked_prefill_enabled = enable_chunked_prefill
self.embedding_mode = embedding_mode
self.preemption_mode = preemption_mode
self.num_scheduler_steps = num_scheduler_steps
self.send_delta_data = send_delta_data
self._verify_args()
def _verify_args(self) -> None:
......@@ -854,6 +947,16 @@ class SchedulerConfig:
f"({self.num_lookahead_slots}) must be greater than or "
"equal to 0.")
if self.num_scheduler_steps < 1:
raise ValueError(
"num_scheduler_steps "
f"({self.num_scheduler_steps}) must be greater than or "
"equal to 1.")
@property
def is_multi_step(self) -> bool:
    """Whether more than one scheduler step is configured."""
    steps = self.num_scheduler_steps
    return steps > 1
class DeviceConfig:
device: Optional[torch.device]
......@@ -865,7 +968,7 @@ class DeviceConfig:
self.device_type = "neuron"
elif is_openvino():
self.device_type = "openvino"
elif is_tpu():
elif current_platform.is_tpu():
self.device_type = "tpu"
elif is_cpu():
self.device_type = "cpu"
......@@ -902,6 +1005,7 @@ class SpeculativeConfig:
target_parallel_config: ParallelConfig,
target_dtype: str,
speculative_model: Optional[str],
speculative_model_quantization: Optional[str],
speculative_draft_tensor_parallel_size: Optional[int],
num_speculative_tokens: Optional[int],
speculative_max_model_len: Optional[int],
......@@ -930,6 +1034,9 @@ class SpeculativeConfig:
target_dtype (str): The data type used for the target model.
speculative_model (Optional[str]): The name of the speculative
model, if provided.
speculative_model_quantization (Optional[str]): Quantization method
that was used to quantize the speculative model weights. If
None, we assume the model weights are not quantized.
speculative_draft_tensor_parallel_size (Optional[int]): The degree
of the tensor parallelism for the draft model.
num_speculative_tokens (Optional[int]): The number of speculative
......@@ -997,11 +1104,11 @@ class SpeculativeConfig:
"Speculative decoding requires usage of the V2 "
"block manager. Enable it with --use-v2-block-manager.")
# TODO: The user should be able to specify revision/quantization/max
# model len for the draft model. It is not currently supported.
# TODO: The user should be able to specify revision/max model len
# for the draft model. It is not currently supported.
draft_revision = None
draft_code_revision = None
draft_quantization = None
draft_quantization = speculative_model_quantization
if speculative_model == "[ngram]":
if ngram_prompt_lookup_min is None:
......@@ -1033,6 +1140,7 @@ class SpeculativeConfig:
code_revision=draft_code_revision,
tokenizer_revision=target_model_config.tokenizer_revision,
max_model_len=None,
spec_target_max_model_len=target_model_config.max_model_len,
quantization=draft_quantization,
enforce_eager=target_model_config.enforce_eager,
max_seq_len_to_capture=target_model_config.
......@@ -1158,7 +1266,7 @@ class SpeculativeConfig:
elif speculative_draft_tensor_parallel_size != 1:
# TODO(wooyeon): allow tp values larger than 1
raise ValueError(
f"{speculative_draft_tensor_parallel_size=} cannot be"
f"{speculative_draft_tensor_parallel_size=} cannot be "
f"other value than 1")
draft_parallel_config = ParallelConfig(
......@@ -1310,8 +1418,9 @@ class LoRAConfig:
long_lora_scaling_factors: Optional[Tuple[float]] = None
def __post_init__(self):
# TODO: Increase the range of rank
possible_max_ranks = (8, 16, 32, 64)
# Setting the maximum rank to 256 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256)
possible_lora_extra_vocab_size = (0, 256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
......@@ -1343,11 +1452,6 @@ class LoRAConfig:
model_config.quantization)
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
if scheduler_config.max_num_batched_tokens > 65528:
raise ValueError(
"Due to limitations of the custom LoRA CUDA kernel, "
"max_num_batched_tokens must be <= 65528 when "
"LoRA is enabled.")
if scheduler_config.chunked_prefill_enabled:
raise ValueError("LoRA is not supported with chunked prefill yet.")
......@@ -1387,10 +1491,15 @@ class PromptAdapterConfig:
@dataclass
class MultiModalConfig:
"""Configs the input data format and how models should run for
multimodal models."""
"""Controls the behavior of multimodal models."""
limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
"""
The maximum number of multi-modal input instances allowed per prompt
for each :class:`~vllm.multimodal.MultiModalPlugin`.
"""
# TODO: Add configs to init vision tower or not.
pass
_STR_DTYPE_TO_TORCH_DTYPE = {
......@@ -1461,6 +1570,7 @@ def _get_and_verify_max_len(
max_model_len: Optional[int],
disable_sliding_window: bool,
sliding_window_len: Optional[int],
spec_target_max_model_len: Optional[int] = None,
) -> int:
"""Get and verify the model's maximum length."""
derived_max_model_len = float("inf")
......@@ -1503,6 +1613,11 @@ def _get_and_verify_max_len(
# If max_model_len is specified, we use it.
return max_model_len
if spec_target_max_model_len is not None:
# If this is a speculative draft model, we use the max model len
# from the target model.
return spec_target_max_model_len
default_max_len = 2048
logger.warning(
"The model's config.json does not contain any of the following "
......@@ -1610,10 +1725,27 @@ class ObservabilityConfig:
"""Configuration for observability."""
otlp_traces_endpoint: Optional[str] = None
# Collecting detailed timing information for each request can be expensive.
# If set, collects the model forward time for the request.
collect_model_forward_time: bool = False
# If set, collects the model execute time for the request.
collect_model_execute_time: bool = False
def __post_init__(self):
if not is_otel_installed() and self.otlp_traces_endpoint is not None:
raise ValueError("OpenTelemetry packages must be installed before "
"configuring 'otlp_traces_endpoint'")
if not is_otel_available() and self.otlp_traces_endpoint is not None:
raise ValueError(
"OpenTelemetry is not available. Unable to configure "
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f"installed. Original error:\n{otel_import_error_traceback}")
if ((self.collect_model_forward_time
or self.collect_model_execute_time)
and self.otlp_traces_endpoint is None):
raise ValueError(
"collect_model_forward_time or collect_model_execute_time "
"requires --otlp-traces-endpoint to be set.")
@dataclass(frozen=True)
......@@ -1629,7 +1761,6 @@ class EngineConfig:
device_config: DeviceConfig
load_config: LoadConfig
lora_config: Optional[LoRAConfig]
multimodal_config: Optional[MultiModalConfig]
speculative_config: Optional[SpeculativeConfig]
decoding_config: Optional[DecodingConfig]
observability_config: Optional[ObservabilityConfig]
......
......@@ -356,7 +356,13 @@ class BlockTable:
appended to blocks. The first such "token block" may have less token ids
than the block size, since the last allocated block may be partially
full.
If no token ids are provided, then no chunks are returned.
"""
if not token_ids:
return []
first_chunk_size = self._block_size - (self._num_full_slots %
self._block_size)
token_blocks = [token_ids[:first_chunk_size]]
......
from collections import deque
from dataclasses import dataclass
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
from vllm.core.block.interfaces import Block, BlockAllocator
......@@ -282,6 +283,58 @@ class BlockList:
return self._block_ids
@dataclass
class CacheMetricData:
    """Running cache hit-rate tracker that aggregates queries in blocks.

    To avoid overflow, the hit rate is maintained at block granularity:
    every ``block_size`` queries are folded into a single running average,
    and only the counters of the block still being filled are kept as
    integers. The real-time hit rate is then derived as follows:

        BS = The number of queries per block.
        nB = The number of completed blocks.
        HR = hit rate of the (nB x BS) completed queries.
        Q  = current number of queries in the unfinished block (< BS).
        H  = current number of hits in the unfinished block (< BS).
        hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
    """
    num_completed_blocks: int = 0
    completed_block_cache_hit_rate: float = 0.0
    num_incompleted_block_queries: int = 0
    num_incompleted_block_hit: int = 0
    block_size: int = 1000

    def query(self, hit: bool):
        """Record one cache lookup; ``hit`` tells whether it was a hit."""
        self.num_incompleted_block_queries += 1
        if hit:
            self.num_incompleted_block_hit += 1

        # Nothing more to do until the current block fills up.
        if self.num_incompleted_block_queries != self.block_size:
            return

        # The block is complete: merge its hit rate into the running
        # average over completed blocks, then start a fresh block.
        block_hit_rate = (self.num_incompleted_block_hit /
                          self.num_incompleted_block_queries)
        blocks_so_far = self.num_completed_blocks
        self.completed_block_cache_hit_rate = (
            self.completed_block_cache_hit_rate * blocks_so_far +
            block_hit_rate) / (blocks_so_far + 1)
        self.num_incompleted_block_queries = 0
        self.num_incompleted_block_hit = 0
        self.num_completed_blocks += 1

    def get_hit_rate(self):
        """Return the overall hit rate, or 0.0 if nothing was queried."""
        # Fraction of a block covered by the unfinished block's queries.
        partial_block = self.num_incompleted_block_queries / self.block_size
        block_total = self.num_completed_blocks + partial_block
        if block_total == 0:
            return 0.0

        # Contribution of the completed blocks, weighted by their count.
        completed_part = 0.0
        if self.num_completed_blocks > 0:
            completed_part = (self.completed_block_cache_hit_rate *
                              self.num_completed_blocks)

        # Contribution of the unfinished block, weighted by its fill level.
        partial_part = 0.0
        if self.num_incompleted_block_queries > 0:
            partial_hit_rate = (self.num_incompleted_block_hit /
                                self.num_incompleted_block_queries)
            partial_part = partial_hit_rate * partial_block

        return (completed_part + partial_part) / block_total
def get_all_blocks_recursively(last_block: Block) -> List[Block]:
"""Retrieves all the blocks in a sequence starting from the last block.
......
......@@ -323,6 +323,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
def all_block_ids(self) -> FrozenSet[int]:
return frozenset(self._block_ids_to_allocator.keys())
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate of the allocator for ``device``.

    A value of -1 means prefix caching is not supported or is disabled.
    """
    allocators = self._allocators
    assert device in allocators
    device_allocator = allocators[device]
    return device_allocator.get_prefix_cache_hit_rate()
def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
"""Returns and clears the mapping of source to destination block IDs.
Will be called after every swapping operations for now, and after every
......
......@@ -186,6 +186,11 @@ class BlockAllocator(ABC):
num_lookahead_slots: int = 0) -> int:
pass
@abstractmethod
def get_prefix_cache_hit_rate(self) -> float:
    """Return the prefix cache hit rate of this allocator.

    Implementations return -1 when prefix caching is not supported or is
    disabled.
    """
    pass
class NoFreeBlocksError(ValueError):
pass
......@@ -278,3 +283,8 @@ class DeviceAwareBlockAllocator(ABC):
There is at most one null block per allocator.
"""
pass
@abstractmethod
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate for the given ``device``.

    Implementations return -1 when prefix caching is not supported or is
    disabled.
    """
    pass
......@@ -307,9 +307,8 @@ class NaiveBlockAllocator(BlockAllocator):
# TODO(cade): make sure the logic is correct and clean it up.
for block in blocks:
if not block.is_full and num_lookahead_slots != 0:
if block.num_empty_slots >= num_lookahead_slots:
new_block_count += 1
else:
new_block_count += 1
if num_lookahead_slots > block.num_empty_slots:
new_block_count += cdiv(
num_lookahead_slots - block.num_empty_slots,
self._block_size)
......@@ -342,6 +341,9 @@ class NaiveBlockAllocator(BlockAllocator):
block.block_id = block_id # Assign block_id
def get_prefix_cache_hit_rate(self) -> float:
    """Always return -1, i.e. "not supported or disabled"."""
    return -1
class NaiveBlock(Block):
"""An implementation of the Block class that does not support prefix
......
"""Token blocks."""
from os.path import commonprefix
from typing import Dict, FrozenSet, Iterable, List, Optional, Tuple
from vllm.core.block.common import (CopyOnWriteTracker,
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
get_all_blocks_recursively)
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
......@@ -107,6 +106,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self._cow_tracker = CopyOnWriteTracker(
refcounter=self._refcounter.as_readonly())
self.metric_data = CacheMetricData()
# Implements Block.Factory.
def _create_block(
self,
......@@ -155,9 +156,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
cached_block_id = self._cached_blocks.get(block.content_hash, None)
if cached_block_id is not None:
self.metric_data.query(hit=True)
block.block_id = cached_block_id
self._incr_refcount_cached_block(block)
return block
self.metric_data.query(hit=False)
self._block_pool.free_block(block)
# No cached block => Allocate a new block
......@@ -404,6 +407,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
def all_block_ids(self) -> FrozenSet[int]:
return self._hashless_allocator.all_block_ids
def get_prefix_cache_hit_rate(self) -> float:
    """Return the hit rate accumulated in ``self.metric_data``."""
    metrics = self.metric_data
    return metrics.get_hit_rate()
def is_block_cached(self, block: Block) -> bool:
assert block.content_hash is not None
if block.content_hash in self._cached_blocks:
......@@ -579,14 +585,17 @@ class PrefixCachingBlockAllocator(BlockAllocator):
num_touched_blocks = 0
for block in blocks:
if not block.is_full:
if block.num_empty_slots >= num_lookahead_slots:
num_touched_blocks += 1
else:
num_touched_blocks += 1
if num_lookahead_slots > block.num_empty_slots:
num_touched_blocks += cdiv(
num_lookahead_slots - block.num_empty_slots,
self._block_size)
else:
if not self.is_block_cached(block):
# If the block has a match in the cache and the cached block
# is not referenced, then we still count it as a touched block
if not self.is_block_cached(block) or \
(block.content_hash is not None and \
self._cached_blocks[block.content_hash] in self.evictor):
num_touched_blocks += 1
return num_touched_blocks
......
"""Block manager utils."""
from vllm.sequence import SequenceGroup
# Exception strings for non-implemented block manager enc/dec scenarios
STR_NOT_IMPL_ENC_DEC_SWA = \
"Sliding window attention for encoder/decoder models " + \
"is not currently supported."
STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
"Prefix caching for encoder/decoder models " + \
"is not currently supported."
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
STR_NOT_IMPL_ENC_DEC_SWA)
def _get_block_mgr_sliding_window_attr(block_mgr):
......
......@@ -8,6 +8,7 @@ from typing import Sequence as GenericSequence
from typing import Set, Tuple
from vllm.block import BlockTable, PhysicalTokenBlock
from vllm.core.block.common import CacheMetricData
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
......@@ -60,6 +61,11 @@ class BlockAllocatorBase(ABC):
def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
pass
@abstractmethod
def get_prefix_cache_hit_rate(self) -> float:
    """Return the prefix cache hit rate of this allocator.

    Implementations return -1 when prefix caching is not supported or is
    disabled.
    """
    pass
class CachedBlockAllocator(BlockAllocatorBase):
"""Manages free physical token blocks for a device.
......@@ -85,6 +91,8 @@ class CachedBlockAllocator(BlockAllocatorBase):
self.default_hash_ctr = count()
self.cache_metric_data = CacheMetricData()
def allocate_block(self, block_hash: int,
num_hashed_tokens: int) -> PhysicalTokenBlock:
if self.current_num_blocks == self.num_blocks:
......@@ -105,15 +113,17 @@ class CachedBlockAllocator(BlockAllocatorBase):
num_hashed_tokens: int = 0) -> PhysicalTokenBlock:
if block_hash is None:
block_hash = next(self.default_hash_ctr)
if block_hash in self.evictor:
assert block_hash not in self.cached_blocks
block = self.evictor.remove(block_hash)
assert block.ref_count == 0
self.cached_blocks[block_hash] = block
block.ref_count += 1
assert block.block_hash == block_hash
return block
if block_hash not in self.cached_blocks:
if block_hash in self.cached_blocks:
self.cache_metric_data.query(hit=True)
else:
self.cache_metric_data.query(hit=False)
self.cached_blocks[block_hash] = self.allocate_block(
block_hash, num_hashed_tokens)
block = self.cached_blocks[block_hash]
......@@ -150,6 +160,9 @@ class CachedBlockAllocator(BlockAllocatorBase):
del self.cached_blocks[old_hash]
self.cached_blocks[block_hash] = block
def get_prefix_cache_hit_rate(self) -> float:
    """Return the hit rate accumulated in ``self.cache_metric_data``."""
    metrics = self.cache_metric_data
    return metrics.get_hit_rate()
class UncachedBlockAllocator(BlockAllocatorBase):
"""Manages free physical token blocks for a device.
......@@ -170,7 +183,7 @@ class UncachedBlockAllocator(BlockAllocatorBase):
self.num_blocks = num_blocks
# Initialize the free blocks.
self.free_blocks: BlockTable = []
self.free_blocks: List[PhysicalTokenBlock] = []
for i in range(num_blocks):
block = PhysicalTokenBlock(device=device,
block_number=i,
......@@ -209,6 +222,9 @@ class UncachedBlockAllocator(BlockAllocatorBase):
raise NotImplementedError(
"Invalid codepath for uncached block allocator.")
def get_prefix_cache_hit_rate(self) -> float:
    """Always return -1, i.e. "not supported or disabled"."""
    return -1
class BlockSpaceManagerV1(BlockSpaceManager):
"""Manages the mapping between logical and physical token blocks."""
......@@ -256,6 +272,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
Device.CPU, block_size, num_cpu_blocks)
# Mapping: seq_id -> BlockTable.
self.block_tables: Dict[int, BlockTable] = {}
# Mapping: req_id -> BlockTable
# Note that each SequenceGroup has a unique
# request ID
......@@ -299,7 +316,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
# Allocate new physical token blocks that will store the prompt tokens.
num_prompt_blocks = seq.n_blocks
block_table: BlockTable = []
block_table: BlockTable = BlockTable()
for logical_idx in range(num_prompt_blocks):
if (self.block_sliding_window is not None
and logical_idx >= self.block_sliding_window):
......@@ -326,15 +343,19 @@ class BlockSpaceManagerV1(BlockSpaceManager):
#
# NOTE: Here we assume that all sequences in the group have the same
# decoder prompt.
seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
wait_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
seq = wait_seqs[0]
block_table: BlockTable = \
self._allocate_sequence(seq,
seq_group.num_seqs(),
is_encoder_decoder)
# Assign the self-attention block tables for each sequence.
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
self.block_tables[seq.seq_id] = block_table.copy()
if len(wait_seqs) == 1:
self.block_tables[seq.seq_id] = block_table
else:
for seq in wait_seqs:
self.block_tables[seq.seq_id] = block_table.copy()
# Allocate encoder sequence
if is_encoder_decoder:
......@@ -476,6 +497,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
return
src_block_table = self.block_tables[parent_seq.seq_id]
self.block_tables[child_seq.seq_id] = src_block_table.copy()
# When using a sliding window, blocks will be eventually reused.
# In this case the block tables will contain repeated blocks.
# When forking, we must make sure that each block's `ref_count`
......@@ -527,7 +549,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
dest_allocator: BlockAllocatorBase,
mapping: Dict[PhysicalTokenBlock,
PhysicalTokenBlock]) -> BlockTable:
new_block_table = []
new_block_table: BlockTable = BlockTable()
for from_block in block_table:
if from_block in mapping:
......@@ -553,8 +575,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
self.block_tables[seq.seq_id] = \
self._swap_block_table(self.block_tables[seq.seq_id],
self.cpu_allocator,
self.gpu_allocator,
self.cpu_allocator, self.gpu_allocator,
mapping)
if seq_group.is_encoder_decoder():
......@@ -580,8 +601,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
self.block_tables[seq.seq_id] = \
self._swap_block_table(self.block_tables[seq.seq_id],
self.gpu_allocator,
self.cpu_allocator,
self.gpu_allocator, self.cpu_allocator,
mapping)
if seq_group.is_encoder_decoder():
......@@ -636,8 +656,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
self.cross_block_tables.clear()
def get_block_table(self, seq: Sequence) -> List[int]:
block_table = self.block_tables[seq.seq_id]
return [block.block_number for block in block_table]
return self.block_tables[seq.seq_id].ids()
def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
block_table = self.cross_block_tables[seq_group.request_id]
......@@ -702,3 +721,10 @@ class BlockSpaceManagerV1(BlockSpaceManager):
if self.enable_caching:
for seq in seq_group.get_seqs():
self.compute_full_blocks_in_seq(seq)
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the prefix cache hit rate of the allocator for ``device``.

    Raises:
        ValueError: If ``device`` is neither ``Device.GPU`` nor
            ``Device.CPU``.
    """
    if device == Device.GPU:
        allocator = self.gpu_allocator
    elif device == Device.CPU:
        allocator = self.cpu_allocator
    else:
        raise ValueError(f"Invalid device: {device}")
    return allocator.get_prefix_cache_hit_rate()
......@@ -441,6 +441,9 @@ class BlockSpaceManagerV2(BlockSpaceManager):
def get_num_free_cpu_blocks(self) -> int:
return self.block_allocator.get_num_free_blocks(Device.CPU)
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the block allocator's prefix cache hit rate for `device`."""
    allocator = self.block_allocator
    hit_rate = allocator.get_prefix_cache_hit_rate(device)
    return hit_rate
def _can_swap(self,
seq_group: SequenceGroup,
device: Device,
......
......@@ -2,6 +2,7 @@ from typing import List, Tuple
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.sequence import Sequence, SequenceGroup
from vllm.utils import Device
class EmbeddingModelBlockSpaceManager(BlockSpaceManager):
......@@ -81,3 +82,6 @@ class EmbeddingModelBlockSpaceManager(BlockSpaceManager):
def mark_blocks_as_computed(self, seq_group: SequenceGroup):
    """Do nothing; this manager takes no action for `seq_group`."""
    return None
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Always return -1, regardless of `device`."""
    hit_rate_sentinel = -1
    return hit_rate_sentinel
......@@ -85,18 +85,21 @@ class LRUEvictor(Evictor):
if len(self.free_table) == 0:
raise ValueError("No usable cache memory left")
evicted_block = next(iter(self.free_table.values()))
evicted_block_id = next(iter(self.free_table.keys()))
evicted_block, evicted_block_id = None, None
# The blocks with the lowest timestamps should be placed consecutively
# at the start of OrderedDict. Loop through all these blocks to
# find the one with maximum number of hashed tokens.
for _id, block in self.free_table.items():
if evicted_block.last_accessed > block.last_accessed or (
evicted_block.last_accessed == block.last_accessed and
evicted_block.num_hashed_tokens < block.num_hashed_tokens):
evicted_block = block
evicted_block_id = _id
if evicted_block is None:
evicted_block, evicted_block_id = block, _id
continue
if evicted_block.last_accessed < block.last_accessed:
break
if evicted_block.num_hashed_tokens < block.num_hashed_tokens:
evicted_block, evicted_block_id = block, _id
assert evicted_block is not None
assert evicted_block_id is not None
self.free_table.pop(evicted_block_id)
return evicted_block_id, evicted_block.content_hash
......
......@@ -5,6 +5,7 @@ from typing import Sequence as GenericSequence
from typing import Tuple
from vllm.sequence import Sequence, SequenceGroup
from vllm.utils import Device
class AllocStatus(enum.Enum):
......@@ -116,3 +117,8 @@ class BlockSpaceManager(ABC):
@abstractmethod
def mark_blocks_as_computed(self, seq_group: SequenceGroup):
    """Abstract hook taking a sequence group; subclasses must override it.

    The base implementation has no behavior and returns None.
    """
@abstractmethod
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Prefix cache hit rate. -1 means not supported or disabled.

    Abstract: subclasses must override this. The base implementation has
    no behavior of its own.
    """
......@@ -12,7 +12,9 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
SequenceGroupMetadata, SequenceStatus)
SequenceGroupMetadata, SequenceGroupMetadataDelta,
SequenceStatus)
from vllm.utils import Device, PyObjectCache
logger = init_logger(__name__)
......@@ -176,10 +178,10 @@ class SchedulerRunningOutputs:
enough memory, it can be preempted (for recompute) or swapped out.
"""
# Selected sequences that are running and in a decoding phase.
decode_seq_groups: List[SequenceGroup]
decode_seq_groups: List[ScheduledSequenceGroup]
# Selected sequences that are running and in a prefill phase.
# I.e., it means the prefill has been chunked.
prefill_seq_groups: List[SequenceGroup]
prefill_seq_groups: List[ScheduledSequenceGroup]
# The preempted sequences.
preempted: List[SequenceGroup]
# Sequences that are swapped out.
......@@ -191,6 +193,10 @@ class SchedulerRunningOutputs:
# The number of slots for lookahead decoding.
num_lookahead_slots: int
# Optimization for fast-access to seq_group lists
decode_seq_groups_list: List[SequenceGroup]
prefill_seq_groups_list: List[SequenceGroup]
@classmethod
def create_empty(cls) -> "SchedulerRunningOutputs":
return SchedulerRunningOutputs(
......@@ -201,6 +207,8 @@ class SchedulerRunningOutputs:
blocks_to_swap_out=[],
blocks_to_copy=[],
num_lookahead_slots=0,
decode_seq_groups_list=[],
prefill_seq_groups_list=[],
)
......@@ -259,6 +267,30 @@ class SchedulerPrefillOutputs:
)
def seq_group_metadata_builder():
    """Build a placeholder `SequenceGroupMetadata` with empty field values."""
    blank_fields = dict(
        request_id="",
        is_prompt=False,
        seq_data={},
        sampling_params=None,
        block_tables={},
    )
    return SequenceGroupMetadata(**blank_fields)
def scheduler_running_outputs_builder():
    """Build a `SchedulerRunningOutputs` whose lists are all new and empty.

    Every list argument is a fresh object, so instances built here never
    share list state with each other.
    """
    blank = SchedulerRunningOutputs(
        decode_seq_groups=[],
        decode_seq_groups_list=[],
        prefill_seq_groups=[],
        prefill_seq_groups_list=[],
        preempted=[],
        swapped_out=[],
        blocks_to_swap_out=[],
        blocks_to_copy=[],
        num_lookahead_slots=0,
    )
    return blank
def scheduled_seq_group_builder():
    """Build a placeholder `ScheduledSequenceGroup` (no group, chunk size 0)."""
    placeholder = ScheduledSequenceGroup(token_chunk_size=0, seq_group=None)
    return placeholder
class Scheduler:
def __init__(
......@@ -331,6 +363,12 @@ class Scheduler:
else 0)
self.num_cumulative_preemption: int = 0
# Used to cache python objects
self._scheduler_running_outputs_cache: PyObjectCache = PyObjectCache(
scheduler_running_outputs_builder)
self._scheduled_seq_group_cache: PyObjectCache = PyObjectCache(
scheduled_seq_group_builder)
@property
def lora_enabled(self) -> bool:
    """Whether `self.lora_config` is truthy."""
    return True if self.lora_config else False
......@@ -392,10 +430,26 @@ class Scheduler:
seq.status = SequenceStatus.FINISHED_ABORTED
self.free_seq(seq)
self._free_seq_group_cross_attn_blocks(aborted_group)
def _free_seq_group_cross_attn_blocks(
    self,
    seq_group: SequenceGroup,
) -> None:
    """
    Release the cross-attention block table held for a sequence group.
    Does nothing unless the group reports itself as encoder/decoder.
    """
    if not seq_group.is_encoder_decoder():
        # Nothing to free for groups that are not encoder/decoder.
        return
    self.block_manager.free_cross(seq_group)
def has_unfinished_seqs(self) -> bool:
    """Return True if the waiting, running or swapped queue is non-empty."""
    all_queues_empty = (len(self.waiting) == 0 and len(self.running) == 0
                        and len(self.swapped) == 0)
    return not all_queues_empty
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Return the block manager's prefix cache hit rate for `device`."""
    manager = self.block_manager
    return manager.get_prefix_cache_hit_rate(device)
def get_num_unfinished_seq_groups(self) -> int:
    """Return the combined length of the waiting, running and swapped queues."""
    queues = (self.waiting, self.running, self.swapped)
    return sum(len(queue) for queue in queues)
......@@ -428,14 +482,30 @@ class Scheduler:
Returns:
SchedulerRunningOutputs.
"""
ret: SchedulerRunningOutputs = \
self._scheduler_running_outputs_cache.get_object()
ret.blocks_to_swap_out.clear()
ret.blocks_to_copy.clear()
ret.decode_seq_groups.clear()
ret.prefill_seq_groups.clear()
ret.preempted.clear()
ret.swapped_out.clear()
ret.num_lookahead_slots = self._get_num_lookahead_slots(
is_prefill=False)
ret.decode_seq_groups_list.clear()
ret.prefill_seq_groups_list.clear()
# Blocks that need to be swapped or copied before model execution.
blocks_to_swap_out: List[Tuple[int, int]] = []
blocks_to_copy: List[Tuple[int, int]] = []
blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy
decode_seq_groups: List[ScheduledSequenceGroup] = []
prefill_seq_groups: List[ScheduledSequenceGroup] = []
preempted: List[SequenceGroup] = []
swapped_out: List[SequenceGroup] = []
decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
prefill_seq_groups: List[
ScheduledSequenceGroup] = ret.prefill_seq_groups
preempted: List[SequenceGroup] = ret.preempted
swapped_out: List[SequenceGroup] = ret.swapped_out
# NOTE(woosuk): Preemption happens only when there is no available slot
# to keep all the sequence groups in the RUNNING state.
......@@ -484,15 +554,19 @@ class Scheduler:
else:
self._append_slots(seq_group, blocks_to_copy)
is_prefill = seq_group.is_prefill()
scheduled_seq_group: ScheduledSequenceGroup = \
self._scheduled_seq_group_cache.get_object()
scheduled_seq_group.seq_group = seq_group
if is_prefill:
prefill_seq_groups.append(
ScheduledSequenceGroup(
seq_group=seq_group,
token_chunk_size=num_running_tokens))
scheduled_seq_group.token_chunk_size = num_running_tokens
prefill_seq_groups.append(scheduled_seq_group)
ret.prefill_seq_groups_list.append(seq_group)
else:
decode_seq_groups.append(
ScheduledSequenceGroup(seq_group=seq_group,
token_chunk_size=1))
scheduled_seq_group.token_chunk_size = 1
decode_seq_groups.append(scheduled_seq_group)
ret.decode_seq_groups_list.append(seq_group)
budget.add_num_batched_tokens(seq_group.request_id,
num_running_tokens)
# OPTIMIZATION: Note that get_max_num_running_seqs is
......@@ -505,15 +579,10 @@ class Scheduler:
if curr_loras is not None and seq_group.lora_int_id > 0:
curr_loras.add(seq_group.lora_int_id)
return SchedulerRunningOutputs(
decode_seq_groups=decode_seq_groups,
prefill_seq_groups=prefill_seq_groups,
preempted=preempted,
swapped_out=swapped_out,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
num_lookahead_slots=self._get_num_lookahead_slots(
is_prefill=False))
self._scheduler_running_outputs_cache.reset()
self._scheduled_seq_group_cache.reset()
return ret
def _schedule_swapped(
self,
......@@ -665,7 +734,7 @@ class Scheduler:
all tokens.
Returns:
SchedulerSwappedInOutputs.
SchedulerPrefillOutputs.
"""
ignored_seq_groups: List[SequenceGroup] = []
seq_groups: List[SequenceGroup] = []
......@@ -738,6 +807,9 @@ class Scheduler:
curr_loras.add(lora_int_id)
waiting_queue.popleft()
self._allocate_and_set_running(seq_group)
seq_group.init_multi_step(
num_scheduler_steps=self._get_num_lookahead_slots(
is_prefill=True) + 1)
seq_groups.append(
ScheduledSequenceGroup(seq_group=seq_group,
token_chunk_size=num_new_tokens))
......@@ -807,11 +879,15 @@ class Scheduler:
# Update waiting requests.
self.waiting.extendleft(running_scheduled.preempted)
# Update new running requests.
self.running.extend([s.seq_group for s in prefills.seq_groups])
self.running.extend(
[s.seq_group for s in running_scheduled.decode_seq_groups])
self.running.extend(
[s.seq_group for s in swapped_in.decode_seq_groups])
if len(prefills.seq_groups) > 0:
self.running.extend([s.seq_group for s in prefills.seq_groups])
self.running.extend(running_scheduled.decode_seq_groups_list)
if len(swapped_in.decode_seq_groups) > 0:
self.running.extend(
[s.seq_group for s in swapped_in.decode_seq_groups])
# Update swapped requests.
self.swapped.extend(running_scheduled.swapped_out)
preempted = (len(running_scheduled.preempted) +
......@@ -821,24 +897,36 @@ class Scheduler:
# doesn't allow chunked prefills.
assert len(running_scheduled.prefill_seq_groups) == 0
assert len(swapped_in.prefill_seq_groups) == 0
# Merge lists
num_prefill_groups = len(prefills.seq_groups)
if num_prefill_groups > 0:
scheduled_seq_groups = prefills.seq_groups
scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
else:
scheduled_seq_groups = running_scheduled.decode_seq_groups
scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
blocks_to_copy = running_scheduled.blocks_to_copy
blocks_to_copy.extend(swapped_in.blocks_to_copy)
ignored_seq_groups = prefills.ignored_seq_groups
ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
return SchedulerOutputs(
scheduled_seq_groups=(prefills.seq_groups +
running_scheduled.decode_seq_groups +
swapped_in.decode_seq_groups),
num_prefill_groups=len(prefills.seq_groups),
scheduled_seq_groups=scheduled_seq_groups,
num_prefill_groups=num_prefill_groups,
num_batched_tokens=budget.num_batched_tokens,
blocks_to_swap_in=swapped_in.blocks_to_swap_in,
blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
blocks_to_copy=running_scheduled.blocks_to_copy +
swapped_in.blocks_to_copy,
ignored_seq_groups=prefills.ignored_seq_groups +
swapped_in.infeasible_seq_groups,
blocks_to_copy=blocks_to_copy,
ignored_seq_groups=ignored_seq_groups,
num_lookahead_slots=running_scheduled.num_lookahead_slots,
running_queue_size=len(self.running),
preempted=preempted,
)
def _schedule_chunked_prefill(self):
def _schedule_chunked_prefill(self) -> SchedulerOutputs:
"""Schedule queued requests.
Chunked prefill allows to chunk prefill requests, batch them together
......@@ -849,7 +937,7 @@ class Scheduler:
The policy can sustain the high GPU utilization because it can put
prefill and decodes requests to the same batch, while it improves
inter token latency because decodes requests don't need to blocked
inter token latency because decodes requests don't need to be blocked
by prefill requests.
"""
budget = SchedulingBudget(
......@@ -947,9 +1035,13 @@ class Scheduler:
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
# such as self.running, self.swapped, and self.waiting.
scheduler_start_time = time.perf_counter()
scheduler_outputs = self._schedule()
now = time.time()
if not self.cache_config.enable_prefix_caching:
common_computed_block_nums = []
# Create input data structures.
seq_group_metadata_list: List[SequenceGroupMetadata] = []
for i, scheduled_seq_group in enumerate(
......@@ -963,52 +1055,88 @@ class Scheduler:
# seq_id -> physical block numbers
block_tables: Dict[int, List[int]] = {}
if seq_group.is_encoder_decoder():
# Encoder associated with SequenceGroup
encoder_seq_data = seq_group.get_encoder_seq().data
# Block table for cross-attention
# Also managed at SequenceGroup level
cross_block_table = self.block_manager.get_cross_block_table(
seq_group)
else:
encoder_seq_data = None
cross_block_table = None
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
seq_id = seq.seq_id
seq_data[seq_id] = seq.data
block_tables[seq_id] = self.block_manager.get_block_table(seq)
self.block_manager.access_all_blocks_in_seq(seq, now)
common_computed_block_nums = (
self.block_manager.get_common_computed_block_ids(
seq_group.get_seqs(status=SequenceStatus.RUNNING)))
if self.cache_config.enable_prefix_caching:
common_computed_block_nums = (
self.block_manager.get_common_computed_block_ids(
seq_group.get_seqs(status=SequenceStatus.RUNNING)))
do_sample = True
if seq_group.is_prefill():
is_prompt = seq_group.is_prefill()
# We should send the metadata to workers when the first prefill
# is sent. Subsequent requests could be chunked prefill or decode.
is_first_prefill = False
if is_prompt:
seqs = seq_group.get_seqs()
# Prefill has only 1 sequence.
assert len(seqs) == 1
num_computed_tokens = seqs[0].data.get_num_computed_tokens()
is_first_prefill = num_computed_tokens == 0
# In the next iteration, all prompt tokens are not computed.
# It means the prefill is chunked, and we don't need sampling.
# NOTE: We use get_len instead of get_prompt_len because when
# a sequence is preempted, prefill includes previous generated
# output tokens.
if (token_chunk_size + seqs[0].data.get_num_computed_tokens() <
if (token_chunk_size + num_computed_tokens <
seqs[0].data.get_len()):
do_sample = False
# It assumes the scheduled_seq_groups is ordered by
# prefill < decoding.
is_prompt = seq_group.is_prefill()
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=is_prompt,
seq_data=seq_data,
sampling_params=seq_group.sampling_params,
block_tables=block_tables,
do_sample=do_sample,
pooling_params=seq_group.pooling_params,
token_chunk_size=token_chunk_size,
lora_request=seq_group.lora_request,
computed_block_nums=common_computed_block_nums,
# `multi_modal_data` will only be present for the 1st comm
# between engine and worker.
# the subsequent comms can still use delta, but
# `multi_modal_data` will be None.
multi_modal_data=seq_group.multi_modal_data
if scheduler_outputs.num_prefill_groups > 0 else None,
prompt_adapter_request=seq_group.prompt_adapter_request,
)
if is_first_prefill or not self.scheduler_config.send_delta_data:
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=is_prompt,
seq_data=seq_data,
sampling_params=seq_group.sampling_params,
block_tables=block_tables,
do_sample=do_sample,
pooling_params=seq_group.pooling_params,
token_chunk_size=token_chunk_size,
lora_request=seq_group.lora_request,
computed_block_nums=common_computed_block_nums,
encoder_seq_data=encoder_seq_data,
cross_block_table=cross_block_table,
state=seq_group.state,
# `multi_modal_data` will only be present for the 1st comm
# between engine and worker.
# the subsequent comms can still use delta, but
# `multi_modal_data` will be None.
multi_modal_data=seq_group.multi_modal_data
if scheduler_outputs.num_prefill_groups > 0 else None,
prompt_adapter_request=seq_group.prompt_adapter_request,
)
else:
# When SPMD mode is enabled, we only send delta data except for
# the first request to reduce serialization cost.
seq_data_delta = {}
for id, data in seq_data.items():
seq_data_delta[id] = data.get_delta_and_reset()
seq_group_metadata = SequenceGroupMetadataDelta(
seq_data_delta,
seq_group.request_id,
block_tables,
is_prompt,
do_sample=do_sample,
token_chunk_size=token_chunk_size,
computed_block_nums=common_computed_block_nums,
)
seq_group_metadata_list.append(seq_group_metadata)
# Now that the batch has been created, we can assume all blocks in the
......@@ -1019,6 +1147,17 @@ class Scheduler:
self.block_manager.mark_blocks_as_computed(
scheduled_seq_group.seq_group)
scheduler_time = time.perf_counter() - scheduler_start_time
# Add this scheduler time to all the sequences that are currently
# running. This will help estimate if the scheduler is a significant
# component in the e2e latency.
for seq_group in self.running:
if seq_group is not None and seq_group.metrics is not None:
if seq_group.metrics.scheduler_time is not None:
seq_group.metrics.scheduler_time += scheduler_time
else:
seq_group.metrics.scheduler_time = scheduler_time
return seq_group_metadata_list, scheduler_outputs
def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
......@@ -1032,6 +1171,8 @@ class Scheduler:
remaining: Deque[SequenceGroup] = deque()
for seq_group in self.running:
if seq_group.is_finished():
# Free cross-attention block table, if it exists
self._free_seq_group_cross_attn_blocks(seq_group)
# Add the finished requests to the finished requests list.
# This list will be used to update the Mamba cache in the
# next step.
......@@ -1062,10 +1203,12 @@ class Scheduler:
slots.
"""
num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False)
seq_group.init_multi_step(num_scheduler_steps=num_lookahead_slots + 1)
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
cows = self.block_manager.append_slots(seq, num_lookahead_slots)
blocks_to_copy.extend(cows)
if len(cows) > 0:
blocks_to_copy.extend(cows)
def _preempt(
self,
......
......@@ -19,7 +19,7 @@ def tensor_model_parallel_all_gather(input_: torch.Tensor,
def tensor_model_parallel_gather(input_: torch.Tensor,
dst: int = 0,
dim: int = -1) -> torch.Tensor:
dim: int = -1) -> Optional[torch.Tensor]:
"""Gather the input tensor across model parallel group."""
return get_tp_group().gather(input_, dst, dim)
......
......@@ -49,8 +49,13 @@ def find_loaded_library(lib_name) -> Optional[str]:
if not found:
# the library is not loaded in the current process
return None
# if lib_name is libcudart, we need to match a line with:
# address /path/to/libcudart-hash.so.11.0
start = line.index("/")
path = line[start:].strip()
filename = path.split("/")[-1]
assert filename.rpartition(".so")[0].startswith(lib_name), \
f"Unexpected filename: {filename} for library {lib_name}"
return path
......@@ -98,9 +103,9 @@ class CudaRTLibrary:
def __init__(self, so_file: Optional[str] = None):
if so_file is None:
so_file = find_loaded_library("libcudart.so")
so_file = find_loaded_library("libcudart")
assert so_file is not None, \
"libcudart.so is not loaded in the current process"
"libcudart is not loaded in the current process"
if so_file not in CudaRTLibrary.path_to_library_cache:
lib = ctypes.CDLL(so_file)
CudaRTLibrary.path_to_library_cache[so_file] = lib
......
......@@ -11,15 +11,17 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
gpu_p2p_access_check)
from vllm.distributed.parallel_state import in_the_same_node_as
from vllm.logger import init_logger
from vllm.utils import cuda_device_count_stateless, is_full_nvlink
from vllm.platforms import current_platform
from vllm.utils import cuda_device_count_stateless
from vllm.utils import is_hip
try:
if (not is_hip()):
assert ops.is_custom_op_supported("_C_custom_ar::meta_size")
ops.meta_size()
custom_ar = True
else:
custom_ar = False
except Exception:
# For AMD GPUs and CPUs
custom_ar = False
......@@ -117,7 +119,10 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
full_nvlink = is_full_nvlink(physical_device_ids)
assert current_platform.is_cuda()
from vllm.platforms.cuda import CudaPlatform
cuda_platform: CudaPlatform = current_platform
full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
if world_size > 2 and not full_nvlink:
logger.warning(
"Custom allreduce is disabled because it's not supported on"
......
......@@ -5,6 +5,7 @@ from torch.distributed import ProcessGroup
from vllm.platforms import current_platform
if current_platform.is_tpu():
import ray
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
from torch_xla._internal import pjrt
......@@ -18,9 +19,15 @@ class TpuCommunicator:
return
self.disabled = False
local_rank = dist.get_rank(group)
world_size = dist.get_world_size(group)
pjrt.initialize_multiprocess(local_rank, world_size)
# NOTE(woosuk): When using TP > 1 on TPUs, every TPU on the same node
# must be used together. Therefore, the local rank and world size can
# be simply calculated as follows.
global_rank = dist.get_rank(group)
global_world_size = dist.get_world_size(group)
num_nodes = len(ray.nodes())
local_world_size = global_world_size // num_nodes
local_rank = global_rank % local_world_size
pjrt.initialize_multiprocess(local_rank, local_world_size)
xr._init_world_size_ordinal()
def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
......
......@@ -329,7 +329,7 @@ class GroupCoordinator:
def gather(self,
input_: torch.Tensor,
dst: int = 0,
dim: int = -1) -> torch.Tensor:
dim: int = -1) -> Optional[torch.Tensor]:
"""
NOTE: We assume that the input tensor is on the same device across
all the ranks.
......
......@@ -2,11 +2,15 @@ import argparse
import dataclasses
import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,
Union)
import torch
import vllm.envs as envs
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
EngineConfig, LoadConfig, LoadFormat, LoRAConfig,
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig, TokenizerPoolConfig)
from vllm.executor.executor_base import ExecutorBase
......@@ -15,11 +19,12 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
if TYPE_CHECKING:
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
logger = init_logger(__name__)
ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"]
def nullable_str(val: str):
if not val or val == "None":
......@@ -27,11 +32,32 @@ def nullable_str(val: str):
return val
def nullable_kvs(val: str) -> Optional[Mapping[str, int]]:
    """Parse a comma-separated list of ``KEY=VALUE`` items into a mapping.

    Args:
        val: Text such as ``"image=16,video=2"``.

    Returns:
        ``None`` if ``val`` is empty, otherwise a dict mapping each key to
        its value converted with ``int``.

    Raises:
        ValueError: If an item does not split into exactly one key and one
            value around ``=``, or if a value cannot be parsed as an int.
    """
    if len(val) == 0:
        return None

    out_dict: Dict[str, int] = {}
    for item in val.split(","):
        # Unpacking a list with the wrong number of parts raises ValueError
        # (not TypeError), so that is the exception to translate here.
        try:
            key, value = item.split("=")
        except ValueError as exc:
            msg = "Each item should be in the form KEY=VALUE"
            raise ValueError(msg) from exc

        try:
            out_dict[key] = int(value)
        except ValueError as exc:
            msg = f"Failed to parse value of item {key}={value}"
            raise ValueError(msg) from exc

    return out_dict
@dataclass
class EngineArgs:
"""Arguments for vLLM engine."""
model: str
served_model_name: Optional[Union[List[str]]] = None
model: str = 'facebook/opt-125m'
served_model_name: Optional[Union[str, List[str]]] = None
tokenizer: Optional[str] = None
skip_tokenizer_init: bool = False
tokenizer_mode: str = 'auto'
......@@ -56,8 +82,8 @@ class EngineArgs:
enable_prefix_caching: bool = False
disable_sliding_window: bool = False
use_v2_block_manager: bool = False
swap_space: int = 4 # GiB
cpu_offload_gb: int = 0 # GiB
swap_space: float = 4 # GiB
cpu_offload_gb: float = 0 # GiB
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
......@@ -69,7 +95,7 @@ class EngineArgs:
rope_theta: Optional[float] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: bool = False
enforce_eager: Optional[bool] = None
max_context_len_to_capture: Optional[int] = None
max_seq_len_to_capture: int = 8192
disable_custom_all_reduce: bool = False
......@@ -79,6 +105,7 @@ class EngineArgs:
# notice.
tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray"
tokenizer_pool_extra_config: Optional[dict] = None
limit_mm_per_prompt: Optional[Mapping[str, int]] = None
enable_lora: bool = False
max_loras: int = 1
max_lora_rank: int = 16
......@@ -88,9 +115,10 @@ class EngineArgs:
fully_sharded_loras: bool = False
lora_extra_vocab_size: int = 256
long_lora_scaling_factors: Optional[Tuple[float]] = None
lora_dtype: str = 'auto'
lora_dtype: Optional[Union[str, torch.dtype]] = 'auto'
max_cpu_loras: Optional[int] = None
device: str = 'auto'
num_scheduler_steps: int = 1
ray_workers_use_nsight: bool = False
num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0
......@@ -104,6 +132,7 @@ class EngineArgs:
guided_decoding_backend: str = 'outlines'
# Speculative decoding configuration.
speculative_model: Optional[str] = None
speculative_model_quantization: Optional[str] = None
speculative_draft_tensor_parallel_size: Optional[int] = None
num_speculative_tokens: Optional[int] = None
speculative_max_model_len: Optional[int] = None
......@@ -117,6 +146,7 @@ class EngineArgs:
disable_logprobs_during_spec_decoding: Optional[bool] = None
otlp_traces_endpoint: Optional[str] = None
collect_detailed_traces: Optional[str] = None
def __post_init__(self):
if self.tokenizer is None:
......@@ -130,7 +160,7 @@ class EngineArgs:
parser.add_argument(
'--model',
type=str,
default='facebook/opt-125m',
default=EngineArgs.model,
help='Name or path of the huggingface model to use.')
parser.add_argument(
'--tokenizer',
......@@ -184,10 +214,7 @@ class EngineArgs:
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
choices=[f.value for f in LoadFormat],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
......@@ -290,7 +317,7 @@ class EngineArgs:
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
choices=[8, 16, 32],
choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
help='Token block size for contiguous chunks of '
'tokens.')
......@@ -318,7 +345,7 @@ class EngineArgs:
default=EngineArgs.seed,
help='Random seed for operations.')
parser.add_argument('--swap-space',
type=int,
type=float,
default=EngineArgs.swap_space,
help='CPU swap space size (GiB) per GPU.')
parser.add_argument(
......@@ -432,6 +459,21 @@ class EngineArgs:
'This should be a JSON string that will be '
'parsed into a dictionary. Ignored if '
'tokenizer_pool_size is 0.')
# Multimodal related configs
parser.add_argument(
'--limit-mm-per-prompt',
type=nullable_kvs,
default=EngineArgs.limit_mm_per_prompt,
# The default value is given in
# MultiModalRegistry.init_mm_limits_per_prompt
help=('For each multimodal plugin, limit how many '
'input instances to allow for each prompt. '
'Expects a comma-separated list of items, '
'e.g.: `image=16,video=2` allows a maximum of 16 '
'images and 2 videos per prompt. Defaults to 1 for '
'each modality.'))
# LoRA related configs
parser.add_argument('--enable-lora',
action='store_true',
......@@ -503,6 +545,11 @@ class EngineArgs:
"tpu", "xpu"
],
help='Device type for vLLM execution.')
parser.add_argument('--num-scheduler-steps',
type=int,
default=1,
help=('Maximum number of forward steps per '
'scheduler call.'))
parser.add_argument(
'--scheduler-delay-factor',
......@@ -525,6 +572,18 @@ class EngineArgs:
default=EngineArgs.speculative_model,
help=
'The name of the draft model to be used in speculative decoding.')
# Quantization settings for speculative model.
parser.add_argument(
'--speculative-model-quantization',
type=nullable_str,
choices=[*QUANTIZATION_METHODS, None],
default=EngineArgs.speculative_model_quantization,
help='Method used to quantize the weights of speculative model.'
'If None, we first check the `quantization_config` '
'attribute in the model config file. If that is '
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.')
parser.add_argument(
'--num-speculative-tokens',
type=int,
......@@ -602,8 +661,10 @@ class EngineArgs:
parser.add_argument(
'--disable-logprobs-during-spec-decoding',
type=bool,
action=StoreBoolean,
default=EngineArgs.disable_logprobs_during_spec_decoding,
nargs="?",
const="True",
help='If set to True, token log probabilities are not returned '
'during speculative decoding. If set to False, log probabilities '
'are returned according to the settings in SamplingParams. If '
......@@ -660,6 +721,16 @@ class EngineArgs:
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
parser.add_argument(
'--collect-detailed-traces',
type=str,
default=None,
help="Valid choices are " +
",".join(ALLOWED_DETAILED_TRACE_MODULES) +
". It makes sense to set this only if --otlp-traces-endpoint is"
" set. If set, it will collect detailed traces for the specified "
"modules. This involves use of possibly costly and or blocking "
"operations and hence might have a performance impact.")
return parser
......@@ -672,6 +743,9 @@ class EngineArgs:
return engine_args
def create_engine_config(self, ) -> EngineConfig:
# gguf file needs a specific model loader and doesn't use hf_repo
if self.model.endswith(".gguf"):
self.quantization = self.load_format = "gguf"
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
......@@ -693,8 +767,6 @@ class EngineArgs:
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")
multimodal_config = MultiModalConfig()
device_config = DeviceConfig(device=self.device)
model_config = ModelConfig(
model=self.model,
......@@ -718,7 +790,8 @@ class EngineArgs:
disable_sliding_window=self.disable_sliding_window,
skip_tokenizer_init=self.skip_tokenizer_init,
served_model_name=self.served_model_name,
multimodal_config=multimodal_config)
limit_mm_per_prompt=self.limit_mm_per_prompt,
)
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
......@@ -779,11 +852,19 @@ class EngineArgs:
"in low performance due to small KV cache space. Consider "
"setting --max-model-len to a smaller value.", max_model_len)
if self.num_scheduler_steps > 1 and not self.use_v2_block_manager:
self.use_v2_block_manager = True
logger.warning(
"Enabled BlockSpaceManagerV2 because it is "
"required for multi-step (--num-scheduler-steps > 1)")
speculative_config = SpeculativeConfig.maybe_create_spec_config(
target_model_config=model_config,
target_parallel_config=parallel_config,
target_dtype=self.dtype,
speculative_model=self.speculative_model,
speculative_model_quantization = \
self.speculative_model_quantization,
speculative_draft_tensor_parallel_size = \
self.speculative_draft_tensor_parallel_size,
num_speculative_tokens=self.num_speculative_tokens,
......@@ -804,18 +885,35 @@ class EngineArgs:
disable_logprobs=self.disable_logprobs_during_spec_decoding,
)
if self.num_scheduler_steps > 1:
if speculative_config is not None:
raise ValueError("Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)")
if self.enable_chunked_prefill:
raise ValueError("Chunked prefill is not supported with "
"multi-step (--num-scheduler-steps > 1)")
# Make sure num_lookahead_slots is set to the higher value, depending on
# whether we are using speculative decoding or multi-step scheduling.
num_lookahead_slots = max(self.num_lookahead_slots,
self.num_scheduler_steps - 1)
num_lookahead_slots = num_lookahead_slots \
if speculative_config is None \
else speculative_config.num_lookahead_slots
scheduler_config = SchedulerConfig(
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len,
use_v2_block_manager=self.use_v2_block_manager,
num_lookahead_slots=(self.num_lookahead_slots
if speculative_config is None else
speculative_config.num_lookahead_slots),
num_lookahead_slots=num_lookahead_slots,
delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill,
embedding_mode=model_config.embedding_mode,
preemption_mode=self.preemption_mode,
num_scheduler_steps=self.num_scheduler_steps,
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
and parallel_config.use_ray),
)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,
......@@ -849,8 +947,21 @@ class EngineArgs:
decoding_config = DecodingConfig(
guided_decoding_backend=self.guided_decoding_backend)
detailed_trace_modules = []
if self.collect_detailed_traces is not None:
detailed_trace_modules = self.collect_detailed_traces.split(",")
for m in detailed_trace_modules:
if m not in ALLOWED_DETAILED_TRACE_MODULES:
raise ValueError(
f"Invalid module {m} in collect_detailed_traces. "
f"Valid modules are {ALLOWED_DETAILED_TRACE_MODULES}")
observability_config = ObservabilityConfig(
otlp_traces_endpoint=self.otlp_traces_endpoint)
otlp_traces_endpoint=self.otlp_traces_endpoint,
collect_model_forward_time="model" in detailed_trace_modules
or "all" in detailed_trace_modules,
collect_model_execute_time="worker" in detailed_trace_modules
or "all" in detailed_trace_modules,
)
if (model_config.get_sliding_window() is not None
and scheduler_config.chunked_prefill_enabled
......@@ -866,7 +977,6 @@ class EngineArgs:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
......@@ -889,7 +999,13 @@ class AsyncEngineArgs(EngineArgs):
parser.add_argument('--engine-use-ray',
action='store_true',
help='Use Ray to start the LLM engine in a '
'separate process as the server process.')
'separate process as the server process.'
'(DEPRECATED. This argument is deprecated '
'and will be removed in a future update. '
'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
'use it. See '
'https://github.com/vllm-project/vllm/issues/7045.'
')')
parser.add_argument('--disable-log-requests',
action='store_true',
help='Disable logging requests.')
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment