Commit a3f8d5dd authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
......@@ -219,7 +219,7 @@ class Executor(ABC):
def sample_tokens(
self, grammar_output: GrammarOutput | None, non_block: bool = False
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
output = self.collective_rpc( # type: ignore[call-overload]
"sample_tokens", args=(grammar_output,), non_block=non_block
)
......
......@@ -124,9 +124,7 @@ class MultiprocExecutor(Executor):
# Set multiprocessing envs
set_multiprocessing_worker_envs()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# get_loopback_ip() for communication.
# use the loopback address get_loopback_ip() for communication.
distributed_init_method = get_distributed_init_method(
get_loopback_ip(), get_open_port()
)
......@@ -294,8 +292,8 @@ class MultiprocExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
unique_reply_rank: int | None = None,
kv_output_aggregator: KVOutputAggregator = None,
) -> Any | list[Any] | Future[Any | list[Any]]:
kv_output_aggregator: KVOutputAggregator | None = None,
) -> Any:
"""Returns single result if unique_reply_rank and/or kv_output_aggregator
is provided, otherwise list."""
assert self.rpc_broadcast_mq is not None, (
......@@ -476,6 +474,8 @@ class WorkerProc:
"""Wrapper that runs one Worker in a separate process."""
READY_STR = "READY"
rpc_broadcast_mq: MessageQueue | None
worker_response_mq: MessageQueue | None
def _init_message_queues(
self, input_shm_handle: Handle, vllm_config: VllmConfig
......@@ -487,7 +487,7 @@ class WorkerProc:
)
# Initializes a message queue for sending the model output
self.worker_response_mq: MessageQueue = MessageQueue(1, 1)
self.worker_response_mq = MessageQueue(1, 1)
self.peer_response_handles = []
else:
# Initialize remote MessageQueue for receiving SchedulerOutput across nodes
......@@ -706,7 +706,7 @@ class WorkerProc:
death_pipe.recv()
except EOFError:
# Parent process has exited, terminate this worker
logger.info("Parent process exited, terminating worker")
logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown
shutdown_event.set()
except Exception as e:
......@@ -720,6 +720,7 @@ class WorkerProc:
try:
reader.close()
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None
# Send READY once we know everything is loaded
ready_writer.send(
......@@ -804,6 +805,7 @@ class WorkerProc:
def worker_busy_loop(self, cancel: threading.Event | None = None):
"""Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True
......
......@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor):
self,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
"""Execute the model on the Ray workers.
The scheduler output to use should have been provided in
......@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor):
"""
scheduler_output = self.scheduler_output
if scheduler_output is None:
return COMPLETED_NONE_FUTURE if non_block else None # noqa
return COMPLETED_NONE_FUTURE if non_block else None
self.scheduler_output = None
......@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor):
scheduler_output: SchedulerOutput,
grammar_output: "GrammarOutput | None",
non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
# Build the compiled DAG for the first time.
if self.forward_dag is None: # type: ignore
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
......
......@@ -67,7 +67,7 @@ class UniProcExecutor(Executor):
kwargs: dict | None = None,
non_block: bool = False,
single_value: bool = False,
) -> Any | list[Any] | Future[Any | list[Any]]:
) -> Any:
if kwargs is None:
kwargs = {}
......@@ -79,10 +79,13 @@ class UniProcExecutor(Executor):
result = run_method(self.driver_worker, method, args, kwargs)
if isinstance(result, AsyncModelRunnerOutput):
if (async_thread := self.async_output_thread) is not None:
get_output = result.get_output
if not single_value:
get_output = lambda go=result.get_output: [go()]
return async_thread.submit(get_output)
if single_value:
return async_thread.submit(result.get_output)
def get_output_list() -> list[Any]:
return [result.get_output()]
return async_thread.submit(get_output_list)
result = result.get_output()
future = Future[Any]()
future.set_result(result if single_value else [result])
......
......@@ -13,7 +13,7 @@ from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.spec import OffloadingSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
......@@ -32,7 +32,7 @@ class CPUOffloadingSpec(OffloadingSpec):
self._manager: OffloadingManager | None = None
# worker-side
self._handler: OffloadingHandler | None = None
self._handlers: CpuGpuOffloadingHandlers | None = None
self.eviction_policy: str = self.extra_config.get("eviction_policy", "lru")
......@@ -67,13 +67,13 @@ class CPUOffloadingSpec(OffloadingSpec):
kv_caches: dict[str, torch.Tensor],
attn_backends: dict[str, type[AttentionBackend]],
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
if not self._handler:
if not self._handlers:
if not current_platform.is_cuda_alike():
raise Exception(
"CPU Offloading is currently only supported on CUDA-alike GPUs"
)
self._handler = CpuGpuOffloadingHandler(
self._handlers = CpuGpuOffloadingHandlers(
attn_backends=attn_backends,
gpu_block_size=self.gpu_block_size,
cpu_block_size=self.offloaded_block_size,
......@@ -81,6 +81,6 @@ class CPUOffloadingSpec(OffloadingSpec):
gpu_caches=kv_caches,
)
assert self._handler is not None
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler
yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler
assert self._handlers is not None
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handlers.gpu_to_cpu_handler
yield CPULoadStoreSpec, GPULoadStoreSpec, self._handlers.cpu_to_gpu_handler
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import deque
import numpy as np
import torch
......@@ -8,7 +9,7 @@ from vllm import _custom_ops as ops
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec
from vllm.v1.kv_offload.worker.worker import (
OffloadingHandler,
TransferResult,
......@@ -51,7 +52,123 @@ def expand_block_ids(
output_idx = output_end_idx
class CpuGpuOffloadingHandler(OffloadingHandler):
class SingleDirectionOffloadingHandler(OffloadingHandler):
    """
    Offloading handler that copies KV cache blocks in one fixed direction,
    from ``src_tensors`` to ``dst_tensors`` (either CPU->GPU or GPU->CPU).

    Transfers execute in the order they were submitted: every transfer runs
    on its own CUDA stream, and that stream is made to wait on the event
    recorded by the transfer submitted right before it.
    Streams and events of finished transfers are pooled and re-used.
    """

    def __init__(
        self,
        src_tensors: list[torch.Tensor],
        dst_tensors: list[torch.Tensor],
        kv_dim_before_num_blocks: list[bool],
        src_block_size_factor: int,
        dst_block_size_factor: int,
        priority: int,
    ):
        """
        Set up a handler for a single transfer direction.

        Args:
            src_tensors: KV cache tensors that blocks are read from.
            dst_tensors: KV cache tensors that blocks are written to,
                in the same order as src_tensors.
            kv_dim_before_num_blocks: one flag per tensor pair; True when
                the tensor has a KV dimension ahead of its num_blocks
                dimension, e.g. (2, num_blocks, ...).
            src_block_size_factor: number of kernel blocks making up one
                KV block of a source tensor.
            dst_block_size_factor: number of kernel blocks making up one
                KV block of a destination tensor.
            priority: priority given to the CUDA streams created here.
                Lower numbers indicate higher priority.
        """
        # The three lists are walked in lockstep, so they must line up.
        assert len(src_tensors) == len(dst_tensors) == len(kv_dim_before_num_blocks)

        self.priority = priority
        self.src_block_size_factor: int = src_block_size_factor
        self.dst_block_size_factor: int = dst_block_size_factor
        self.src_tensors: list[torch.Tensor] = src_tensors
        self.dst_tensors: list[torch.Tensor] = dst_tensors
        self.kv_dim_before_num_blocks: list[bool] = kv_dim_before_num_blocks

        # in-flight transfers in submission order: (job_id, stream, event)
        self._transfers: deque[tuple[int, torch.cuda.Stream, torch.Event]] = deque()
        # CUDA streams of finished transfers, kept for re-use
        self._stream_pool: list[torch.cuda.Stream] = []
        # CUDA events of finished transfers, kept for re-use
        self._event_pool: list[torch.Event] = []

    def transfer_async(self, job_id: int, transfer_spec: TransferSpec) -> bool:
        """
        Submit an asynchronous copy of the blocks named by transfer_spec.

        Args:
            job_id: identifier reported back by get_finished() once the
                copy has completed.
            transfer_spec: (source spec, destination spec) pair; both must
                be BlockIDsLoadStoreSpec carrying 1-D block id arrays.

        Returns:
            True once the transfer has been queued.
        """
        source_spec, target_spec = transfer_spec
        assert isinstance(source_spec, BlockIDsLoadStoreSpec)
        assert isinstance(target_spec, BlockIDsLoadStoreSpec)

        source_ids = source_spec.block_ids
        target_ids = target_spec.block_ids
        assert source_ids.ndim == 1
        assert target_ids.ndim == 1

        # Work in units of kernel (sub) blocks on both sides.
        num_src_sub_blocks = source_ids.size * self.src_block_size_factor
        num_dst_sub_blocks = target_ids.size * self.dst_block_size_factor
        # Source sub-blocks handed to expand_block_ids as skip_count so that
        # the source and destination sub-block counts agree.
        num_skipped = -target_ids.size % self.src_block_size_factor
        assert num_dst_sub_blocks == num_src_sub_blocks - num_skipped

        # Column 0 holds source sub-block ids, column 1 destination ids.
        mapping = np.empty((num_dst_sub_blocks, 2), dtype=np.int64)
        expand_block_ids(
            source_ids,
            self.src_block_size_factor,
            mapping[:, 0],
            skip_count=num_skipped,
        )
        expand_block_ids(target_ids, self.dst_block_size_factor, mapping[:, 1])
        mapping_tensor = torch.from_numpy(mapping)

        # Prefer pooled resources over allocating new ones.
        if self._stream_pool:
            stream = self._stream_pool.pop()
        else:
            stream = torch.cuda.Stream(priority=self.priority)
        if self._event_pool:
            event = self._event_pool.pop()
        else:
            event = torch.Event()

        if self._transfers:
            # assure job will start only after the previous one completes
            previous_event = self._transfers[-1][2]
            stream.wait_event(previous_event)

        with torch.cuda.stream(stream):
            tensor_triples = zip(
                self.src_tensors, self.dst_tensors, self.kv_dim_before_num_blocks
            )
            for src_tensor, dst_tensor, has_kv_dim in tensor_triples:
                if not has_kv_dim:
                    ops.swap_blocks(src_tensor, dst_tensor, mapping_tensor)
                    continue
                # Leading KV dimension: copy key and value caches separately.
                src_key_cache, src_value_cache = src_tensor
                dst_key_cache, dst_value_cache = dst_tensor
                ops.swap_blocks(src_key_cache, dst_key_cache, mapping_tensor)
                ops.swap_blocks(src_value_cache, dst_value_cache, mapping_tensor)
            # Marks completion of this transfer on its stream.
            event.record(stream)

        self._transfers.append((job_id, stream, event))

        # success
        return True

    def get_finished(self) -> list[TransferResult]:
        """
        Collect transfers that have completed since the previous call.

        Only the front of the queue is inspected, so results come back in
        submission order and collection stops at the first transfer whose
        event has not completed yet.

        Returns:
            A list of (job_id, True) entries, one per finished transfer.
        """
        finished: list[TransferResult] = []
        pending = self._transfers
        while pending:
            if not pending[0][2].query():
                break
            job_id, stream, event = pending.popleft()
            finished.append((job_id, True))
            # Hand the stream and event back for later transfers.
            self._stream_pool.append(stream)
            self._event_pool.append(event)
        return finished
class CpuGpuOffloadingHandlers:
def __init__(
self,
gpu_block_size: int,
......@@ -60,27 +177,20 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
gpu_caches: dict[str, torch.Tensor],
attn_backends: dict[str, type[AttentionBackend]],
):
assert gpu_caches
assert cpu_block_size % gpu_block_size == 0
self.block_size_factor = cpu_block_size // gpu_block_size
# cuda streams for gpu->cpu and cpu->gpu
self.d2h_stream = torch.cuda.Stream()
self.h2d_stream = torch.cuda.Stream()
# job_id -> transfer cuda event
self.transfer_events: dict[int, torch.Event] = {}
# list of cuda events available for re-use
self.events_pool: list[torch.Event] = []
block_size_factor = cpu_block_size // gpu_block_size
pin_memory = is_pin_memory_available()
# allocate cpu tensors
logger.info("Allocating %d CPU tensors...", len(gpu_caches))
self.gpu_tensors: list[torch.Tensor] = []
self.cpu_tensors: list[torch.Tensor] = []
self.kv_dim_before_num_blocks: list[bool] = []
gpu_tensors: list[torch.Tensor] = []
cpu_tensors: list[torch.Tensor] = []
kv_dim_before_num_blocks: list[bool] = []
kernel_block_size: int | None = None
for layer_name, gpu_tensor in gpu_caches.items():
self.gpu_tensors.append(gpu_tensor)
gpu_tensors.append(gpu_tensor)
gpu_shape = gpu_tensor.shape
attn_backend = attn_backends[layer_name]
......@@ -88,16 +198,21 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256
)
has_layers_dim = False
if len(gpu_shape) != len(test_shape):
# cross-layers tensor
# shape is (num_blocks, ...)
assert len(gpu_shape) == len(test_shape) + 1
num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False)
has_layers_dim = True
kv_dim_before_num_blocks.append(False)
# prepend a dummy num_layers=80 to test_shape
test_shape = (80,) + test_shape
elif test_shape[0] == 1234:
# shape is (num_blocks, ...)
num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False)
kv_dim_before_num_blocks.append(False)
else:
# shape should be (2, num_blocks, ...)
assert test_shape[0] == 2
......@@ -105,13 +220,32 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
assert gpu_shape[0] == 2
num_blocks_idx = 1
self.kv_dim_before_num_blocks.append(True)
kv_dim_before_num_blocks.append(True)
try:
kv_cache_stride_order = attn_backend.get_kv_cache_stride_order(
include_num_layers_dimension=has_layers_dim
)
assert len(kv_cache_stride_order) == len(gpu_shape)
except (AttributeError, NotImplementedError):
kv_cache_stride_order = tuple(range(len(gpu_shape)))
# permute test_shape according to stride_order
test_shape = tuple(test_shape[i] for i in kv_cache_stride_order)
# find block_size (16) dimension index
block_size_idx = test_shape.index(16)
if kernel_block_size is not None:
assert kernel_block_size == gpu_shape[block_size_idx]
else:
kernel_block_size = gpu_shape[block_size_idx]
assert gpu_block_size % kernel_block_size == 0
cpu_shape = list(gpu_shape)
cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor
cpu_shape[num_blocks_idx] = num_cpu_blocks * block_size_factor
logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
self.cpu_tensors.append(
cpu_tensors.append(
torch.zeros(
cpu_shape,
dtype=gpu_tensor.dtype,
......@@ -120,72 +254,27 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
)
)
def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
src_spec, dst_spec = spec
if isinstance(src_spec, CPULoadStoreSpec):
assert isinstance(dst_spec, GPULoadStoreSpec)
stream = self.h2d_stream
src_tensors = self.cpu_tensors
dst_tensors = self.gpu_tensors
src_block_size_factor = self.block_size_factor
dst_block_size_factor = 1
else:
assert isinstance(src_spec, GPULoadStoreSpec)
assert isinstance(dst_spec, CPULoadStoreSpec)
stream = self.d2h_stream
src_tensors = self.gpu_tensors
dst_tensors = self.cpu_tensors
src_block_size_factor = 1
dst_block_size_factor = self.block_size_factor
src_blocks = src_spec.block_ids
dst_blocks = dst_spec.block_ids
assert src_blocks.ndim == 1
assert dst_blocks.ndim == 1
assert kernel_block_size is not None
gpu_block_size_factor = gpu_block_size // kernel_block_size
cpu_block_size_factor = cpu_block_size // kernel_block_size
src_sub_block_count = src_blocks.size * src_block_size_factor
dst_sub_block_count = dst_blocks.size * dst_block_size_factor
src_sub_blocks_to_skip = -dst_blocks.size % src_block_size_factor
# TODO (orozery): adapt swap_blocks to support gpu_block_size_factor
assert gpu_block_size_factor == 1
assert dst_sub_block_count == src_sub_block_count - src_sub_blocks_to_skip
src_to_dst = np.empty((dst_sub_block_count, 2), dtype=np.int64)
expand_block_ids(
src_blocks,
src_block_size_factor,
src_to_dst[:, 0],
skip_count=src_sub_blocks_to_skip,
self.gpu_to_cpu_handler = SingleDirectionOffloadingHandler(
src_tensors=gpu_tensors,
dst_tensors=cpu_tensors,
kv_dim_before_num_blocks=kv_dim_before_num_blocks,
src_block_size_factor=gpu_block_size_factor,
dst_block_size_factor=cpu_block_size_factor,
priority=1,
)
expand_block_ids(dst_blocks, dst_block_size_factor, src_to_dst[:, 1])
src_to_dst_tensor = torch.from_numpy(src_to_dst)
event = self.events_pool.pop() if self.events_pool else torch.Event()
with torch.cuda.stream(stream):
for src_tensor, dst_tensor, kv_dim in zip(
src_tensors, dst_tensors, self.kv_dim_before_num_blocks
):
if kv_dim:
src_key_cache = src_tensor[0]
dst_key_cache = dst_tensor[0]
ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst_tensor)
src_value_cache = src_tensor[1]
dst_value_cache = dst_tensor[1]
ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst_tensor)
else:
ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
event.record(stream)
self.transfer_events[job_id] = event
# success
return True
def get_finished(self) -> list[TransferResult]:
results: list[TransferResult] = []
for job_id, event in self.transfer_events.items():
if event.query():
results.append((job_id, True))
self.events_pool.append(event)
for job_id, _ in results:
del self.transfer_events[job_id]
return results
self.cpu_to_gpu_handler = SingleDirectionOffloadingHandler(
src_tensors=cpu_tensors,
dst_tensors=gpu_tensors,
kv_dim_before_num_blocks=kv_dim_before_num_blocks,
src_block_size_factor=cpu_block_size_factor,
dst_block_size_factor=gpu_block_size_factor,
priority=-1,
)
......@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
from vllm.distributed.kv_events import KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
else:
KVConnectorStats = object
KVConnectorKVEvents = object
class LogprobsLists(NamedTuple):
......@@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending: set[str] | None = None
finished_recving: set[str] | None = None
kv_connector_stats: KVConnectorStats | None = None
kv_cache_events: KVConnectorKVEvents | None = None
# IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids: set[int] = field(default_factory=set)
......@@ -123,6 +126,7 @@ class KVConnectorOutput:
not self.finished_sending
and not self.finished_recving
and not self.kv_connector_stats
and not self.kv_cache_events
and not self.invalid_block_ids
)
......
......@@ -209,10 +209,10 @@ class Request:
# Translate this request's current status into a FinishReason by delegating
# to RequestStatus.get_finished_reason(self.status).
# NOTE(review): the return annotation allows None — presumably for statuses
# that have no finish reason; confirm against RequestStatus.
def get_finished_reason(self) -> FinishReason | None:
return RequestStatus.get_finished_reason(self.status)
def get_num_encoder_tokens(self, input_id: int) -> int:
def get_num_encoder_embeds(self, input_id: int) -> int:
assert input_id < len(self.mm_features)
num_tokens = self.mm_features[input_id].mm_position.length
return num_tokens
num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
return num_embeds
def record_event(
self,
......@@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum):
FINISHED_LENGTH_CAPPED = enum.auto()
FINISHED_ABORTED = enum.auto()
FINISHED_IGNORED = enum.auto()
FINISHED_ERROR = enum.auto()
# Stringify a status as its bare enum member name (self.name),
# e.g. "FINISHED_ERROR".
def __str__(self):
return self.name
......@@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = {
RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
RequestStatus.FINISHED_ERROR: FinishReason.ERROR,
}
......@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module):
)
logprobs_tensors = None
if sampling_metadata.max_num_logprobs:
if sampling_metadata.max_num_logprobs is not None:
logprobs_tensors = self._get_logprobs_tensors(
sampling_metadata.max_num_logprobs,
metadata,
......
......@@ -170,7 +170,6 @@ class EagleProposer:
self.allowed_attn_types: tuple | None = None
if current_platform.is_rocm():
rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
# ROCM_AITER_FA is an optional backend
# if find_spec(
# AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False)
......@@ -180,6 +179,12 @@ class EagleProposer:
# )
# rocm_types.append(AiterFlashAttentionMetadata)
# TRITON_MLA backend support for MLA models (e.g., DeepSeek)
from vllm.v1.attention.backends.mla.common import MLACommonMetadata
rocm_types.append(MLACommonMetadata)
self.allowed_attn_types = tuple(rocm_types)
# Parse the speculative token tree.
......
......@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (
......@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config(
self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config
)
reasoning_parser = (
......
......@@ -10,7 +10,8 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,
......@@ -56,6 +57,27 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids,
add_prefix_space=True,
)
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer,
......@@ -246,13 +268,7 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
# Unsupported keywords for objects
if obj.get("type") == "object" and any(
key in obj
for key in (
"minProperties",
"maxProperties",
"propertyNames",
"patternProperties",
)
key in obj for key in ("patternProperties", "propertyNames")
):
return True
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, cast
from vllm.config import VllmConfig, get_layers_from_vllm_config
if TYPE_CHECKING:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
else:
AttentionLayerBase = object
def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
    """Assert that the attention impls support the configured context parallelism.

    Reads the prefill (PCP) and decode (DCP) context-parallel sizes and the
    KV-cache interleave size from ``vllm_config.parallel_config``. When
    ``pcp_size * dcp_size`` is greater than 1, every attention layer returned
    by ``get_layers_from_vllm_config`` that has a non-None ``impl`` attribute
    is checked against the capability flags of that impl.

    Args:
        vllm_config: the configuration whose parallel and speculative
            settings decide which checks apply.

    Raises:
        AssertionError: if an impl lacks a capability flag required by the
            configured PCP / DCP / speculative-decoding combination.
    """
    parallel_config = vllm_config.parallel_config
    pcp_size = parallel_config.prefill_context_parallel_size
    dcp_size = parallel_config.decode_context_parallel_size
    interleave_size = parallel_config.cp_kv_cache_interleave_size

    # Nothing to validate unless the combined context-parallel size exceeds 1.
    if pcp_size * dcp_size <= 1:
        return

    attention_layers = get_layers_from_vllm_config(
        vllm_config, cast(type[Any], AttentionLayerBase)
    )
    for attention_layer in attention_layers.values():
        layer_impl = getattr(attention_layer, "impl", None)
        if layer_impl is None:
            # Layers without an impl have no capability flags to check.
            continue

        # A speculative config combined with a non-trivial interleave size
        # needs explicit support from the impl.
        needs_mtp_interleave_support = (
            vllm_config.speculative_config is not None and interleave_size > 1
        )
        if needs_mtp_interleave_support:
            assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
                "MTP with cp_kv_cache_interleave_size > 1 is not "
                f"supported in {layer_impl.__class__.__name__}."
            )

        if dcp_size > 1:
            assert layer_impl.need_to_return_lse_for_decode, (
                "DCP requires attention impls to return"
                " the softmax lse for decode, but the impl "
                f"{layer_impl.__class__.__name__} "
                "does not return the softmax lse for decode."
            )

        if pcp_size > 1:
            assert layer_impl.supports_pcp, (
                "PCP requires attention impls' support, "
                f"but the impl {layer_impl.__class__.__name__} "
                "does not support PCP."
            )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import gc
import itertools
import time
......@@ -148,6 +149,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
......@@ -160,15 +162,14 @@ from vllm.v1.worker.ubatch_utils import (
maybe_create_ubatch_slices,
)
from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.workspace import lock_workspace
from .utils import (
AttentionGroup,
MultiModalBudget,
add_kv_sharing_layers_to_kv_cache_groups,
bind_kv_cache,
gather_mm_placeholders,
sanity_check_mm_encoder_outputs,
scatter_mm_placeholders,
)
if TYPE_CHECKING:
......@@ -295,6 +296,7 @@ class GPUModelRunner(
self.device = device
self.pin_memory = is_pin_memory_available()
self.dtype = self.model_config.dtype
self.kv_cache_dtype = kv_cache_dtype_str_to_dtype(
cache_config.cache_dtype, self.model_config
)
......@@ -1267,6 +1269,8 @@ class GPUModelRunner(
if not isinstance(kv_cache_spec, CrossAttentionSpec):
return None, None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self.encoder_seq_lens.np[:num_reqs] = 0
# Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch
for req_id in num_scheduled_tokens:
......@@ -1530,28 +1534,13 @@ class GPUModelRunner(
"""
:return: tuple[attn_metadata, spec_decode_common_attn_metadata]
"""
# Attention metadata is not needed for attention free models
if len(self.kv_cache_config.kv_cache_groups) == 0:
return {}, None
num_tokens_padded = num_tokens_padded or num_tokens
num_reqs_padded = num_reqs_padded or num_reqs
logits_indices_padded = None
num_logits_indices = None
if logits_indices is not None:
num_logits_indices = logits_indices.size(0)
if self.cache_config.kv_sharing_fast_prefill:
logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
logits_indices
)
# update seq_lens of decode reqs under DCP.
if self.dcp_world_size > 1:
self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
self.seq_lens.cpu[:num_reqs],
self.dcp_world_size,
self.dcp_rank,
self.parallel_config.cp_kv_cache_interleave_size,
)
self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0)
self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded)
assert num_reqs_padded is not None and num_tokens_padded is not None
attn_metadata: PerLayerAttnMetadata = {}
if ubatch_slices is not None:
......@@ -1572,36 +1561,12 @@ class GPUModelRunner(
self.num_accepted_tokens.np[num_reqs:].fill(1)
self.num_accepted_tokens.copy_to_gpu()
# Used in the below loop, uses padded shapes
query_start_loc = self.query_start_loc.gpu[: num_reqs_padded + 1]
query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs_padded + 1]
seq_lens = self.seq_lens.gpu[:num_reqs_padded]
seq_lens_cpu = self.seq_lens.cpu[:num_reqs_padded]
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
:num_reqs_padded
]
dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None
if self.dcp_world_size > 1:
dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded]
dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs_padded]
spec_decode_common_attn_metadata = None
kv_cache_groups = self.kv_cache_config.kv_cache_groups
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
for kv_cache_gid, kv_cache_group in enumerate(
self.kv_cache_config.kv_cache_groups
):
encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens(
num_scheduled_tokens or {},
kv_cache_group.kv_cache_spec,
num_reqs_padded,
)
if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec):
# Encoder-only layers do not have KV cache, so we need to
# create a dummy block table and slot mapping for them.
def _get_block_table_and_slot_mapping(kv_cache_gid: int):
assert num_reqs_padded is not None and num_tokens_padded is not None
kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec
if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
blk_table_tensor = torch.zeros(
(num_reqs_padded, 1),
dtype=torch.int32,
......@@ -1617,92 +1582,129 @@ class GPUModelRunner(
blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded)
slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded]
# Fill unused with -1. Needed for reshape_and_cache in full cuda
# graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
common_attn_metadata = CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_actual_tokens=num_tokens_padded,
num_reqs=num_reqs_padded,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=blk_table_tensor,
slot_mapping=slot_mapping,
logits_indices_padded=logits_indices_padded,
num_logits_indices=num_logits_indices,
causal=True,
encoder_seq_lens=encoder_seq_lens,
encoder_seq_lens_cpu=encoder_seq_lens_cpu,
dcp_local_seq_lens=dcp_local_seq_lens,
dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu,
# Fill unused with -1. Needed for reshape_and_cache in full cuda
# graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
return blk_table_tensor, slot_mapping
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
cm_base = CommonAttentionMetadata(
query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
seq_lens=self.seq_lens.gpu[:num_reqs_padded],
_seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded],
_num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[
:num_reqs_padded
],
num_reqs=num_reqs_padded,
num_actual_tokens=num_tokens_padded,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_gid_0,
slot_mapping=slot_mapping_gid_0,
causal=True,
)
if self.dcp_world_size > 1:
self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
self.seq_lens.cpu[:num_reqs],
self.dcp_world_size,
self.dcp_rank,
self.parallel_config.cp_kv_cache_interleave_size,
)
self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0)
self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded)
cm_base.dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded]
cm_base.dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[
:num_reqs_padded
]
if logits_indices is not None and self.cache_config.kv_sharing_fast_prefill:
cm_base.num_logits_indices = logits_indices.size(0)
cm_base.logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
logits_indices
)
def _build_attn_group_metadata(
    kv_cache_gid: int,
    attn_gid: int,
    common_attn_metadata: CommonAttentionMetadata,
    ubid: int | None = None,
) -> None:
    """Build attention metadata for one attention group and store it per layer.

    Args:
        kv_cache_gid: Index of the KV cache group in ``self.attn_groups``.
        attn_gid: Index of the attention group inside that KV cache group.
        common_attn_metadata: Metadata handed to the group's builder.
        ubid: Micro-batch index, or None when the batch is not split. With
            None the result is written into ``attn_metadata`` directly
            (asserted to be a dict); otherwise into ``attn_metadata[ubid]``
            (``attn_metadata`` asserted to be a list).
    """
    group = self.attn_groups[kv_cache_gid][attn_gid]

    # Cascade prefix length is only looked up when prefix lengths were given.
    if cascade_attn_prefix_lens:
        prefix_len = cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
    else:
        prefix_len = 0

    # `ubid or 0`: builder index 0 is requested both for ubid=None and ubid=0.
    metadata_builder = group.get_metadata_builder(ubid or 0)

    # Extra keyword arguments are only supplied to GDN builders when
    # speculative decoding is in use.
    builder_kwargs = {}
    if use_spec_decode and isinstance(metadata_builder, GDNAttentionMetadataBuilder):
        assert ubid is None, "UBatching not supported with GDN yet"
        builder_kwargs["num_accepted_tokens"] = self.num_accepted_tokens.gpu[
            :num_reqs_padded
        ]
        builder_kwargs["num_decode_draft_tokens_cpu"] = (
            self.num_decode_draft_tokens.cpu[:num_reqs_padded]
        )

    if for_cudagraph_capture:
        built = metadata_builder.build_for_cudagraph_capture(common_attn_metadata)
    else:
        built = metadata_builder.build(
            common_prefix_len=prefix_len,
            common_attn_metadata=common_attn_metadata,
            **builder_kwargs,
        )

    # Pick the destination mapping: the shared dict, or this micro-batch's dict.
    if ubid is None:
        assert isinstance(attn_metadata, dict)
        target = attn_metadata
    else:
        assert isinstance(attn_metadata, list)
        target = attn_metadata[ubid]

    # Every layer of the group shares the same metadata object.
    for layer_name in group.layer_names:
        target[layer_name] = built
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
spec_decode_common_attn_metadata = None
for kv_cache_gid, kv_cache_group in enumerate(kv_cache_groups):
cm = copy(cm_base) # shallow copy
# Basically only the encoder seq_lens, block_table and slot_mapping change
# for each kv_cache_group.
cm.encoder_seq_lens, cm.encoder_seq_lens_cpu = self._get_encoder_seq_lens(
num_scheduled_tokens or {},
kv_cache_group.kv_cache_spec,
num_reqs_padded,
)
if kv_cache_gid > 0:
cm.block_table_tensor, cm.slot_mapping = (
_get_block_table_and_slot_mapping(kv_cache_gid)
)
if self.speculative_config and spec_decode_common_attn_metadata is None:
if isinstance(self.drafter, EagleProposer):
if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names:
spec_decode_common_attn_metadata = common_attn_metadata
spec_decode_common_attn_metadata = cm
else:
spec_decode_common_attn_metadata = common_attn_metadata
for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]):
cascade_attn_prefix_len = (
cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
if cascade_attn_prefix_lens
else 0
)
builder = attn_group.get_metadata_builder()
extra_attn_metadata_args = {}
if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
extra_attn_metadata_args = dict(
num_accepted_tokens=self.num_accepted_tokens.gpu[
:num_reqs_padded
],
num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[
:num_reqs_padded
],
)
spec_decode_common_attn_metadata = cm
for attn_gid in range(len(self.attn_groups[kv_cache_gid])):
if ubatch_slices is not None:
common_attn_metadata_list = split_attn_metadata(
ubatch_slices, common_attn_metadata
)
for ubid, common_attn_metadata in enumerate(
common_attn_metadata_list
):
builder = attn_group.get_metadata_builder(ubatch_id=ubid)
if for_cudagraph_capture:
attn_metadata_i = builder.build_for_cudagraph_capture(
common_attn_metadata
)
else:
attn_metadata_i = builder.build(
common_prefix_len=cascade_attn_prefix_len,
common_attn_metadata=common_attn_metadata,
)
for layer_name in kv_cache_group.layer_names:
assert type(attn_metadata) is list
attn_metadata[ubid][layer_name] = attn_metadata_i
for ubid, _cm in enumerate(split_attn_metadata(ubatch_slices, cm)):
_build_attn_group_metadata(kv_cache_gid, attn_gid, _cm, ubid)
else:
assert isinstance(attn_metadata, dict)
if for_cudagraph_capture:
attn_metadata_i = builder.build_for_cudagraph_capture(
common_attn_metadata
)
else:
attn_metadata_i = builder.build(
common_prefix_len=cascade_attn_prefix_len,
common_attn_metadata=common_attn_metadata,
**extra_attn_metadata_args,
)
for layer_name in attn_group.layer_names:
attn_metadata[layer_name] = attn_metadata_i
_build_attn_group_metadata(kv_cache_gid, attn_gid, cm)
if self.is_mm_prefix_lm:
req_doc_ranges = {}
......@@ -2183,10 +2185,7 @@ class GPUModelRunner(
# Cache the encoder outputs by mm_hash
for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
self.encoder_cache[mm_hash] = scatter_mm_placeholders(
output,
is_embed=pos_info.is_embed,
)
self.encoder_cache[mm_hash] = output
logger.debug("Finish execute for mm hash %s", mm_hash)
self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash)
......@@ -2237,6 +2236,13 @@ class GPUModelRunner(
num_encoder_tokens,
)
assert start_idx < end_idx
curr_embeds_start, curr_embeds_end = (
pos_info.get_embeds_indices_in_range(start_idx, end_idx)
)
# If there are no embeddings in the current range, we skip
# gathering the embeddings.
if curr_embeds_start == curr_embeds_end:
continue
mm_hash = mm_feature.identifier
encoder_output = self.encoder_cache.get(mm_hash, None)
......@@ -2244,16 +2250,14 @@ class GPUModelRunner(
if (is_embed := pos_info.is_embed) is not None:
is_embed = is_embed[start_idx:end_idx]
mm_embeds_item = encoder_output[curr_embeds_start:curr_embeds_end]
else:
mm_embeds_item = encoder_output[start_idx:end_idx]
req_start_pos = req_start_idx + start_pos - num_computed_tokens
is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = (
True if is_embed is None else is_embed
)
mm_embeds_item = gather_mm_placeholders(
encoder_output[start_idx:end_idx],
is_embed=is_embed,
)
mm_embeds_req.append(mm_embeds_item)
if self.is_multimodal_pruning_enabled and self.uses_mrope:
......@@ -2764,6 +2768,7 @@ class GPUModelRunner(
# be improved in model runner v2)
force_uniform_decode: bool | None = None,
force_has_lora: bool | None = None,
num_encoder_reqs: int = 0,
) -> tuple[
CUDAGraphMode,
BatchDescriptor,
......@@ -2780,6 +2785,11 @@ class GPUModelRunner(
if force_uniform_decode is None
else force_uniform_decode
)
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batches are uniform.
has_encoder_output = (
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
)
has_lora = (
len(self.input_batch.lora_id_to_lora_request) > 0
......@@ -2799,7 +2809,7 @@ class GPUModelRunner(
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded, use_cascade_attn
num_tokens_padded, use_cascade_attn or has_encoder_output
)
num_tokens_padded = batch_descriptor.num_tokens
......@@ -2997,6 +3007,7 @@ class GPUModelRunner(
num_scheduled_tokens_np=num_scheduled_tokens_np,
max_num_scheduled_tokens=max_num_scheduled_tokens,
use_cascade_attn=cascade_attn_prefix_lens is not None,
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
)
logger.debug(
......@@ -3562,74 +3573,89 @@ class GPUModelRunner(
if self.parallel_config.enable_eplb:
self.eplb_state = EplbState(self.parallel_config, self.device)
eplb_models = 0
with DeviceMemoryProfiler() as m:
time_before_load = time.perf_counter()
model_loader = get_model_loader(self.load_config)
self.model = model_loader.load_model(
vllm_config=self.vllm_config, model_config=self.model_config
)
if self.lora_config:
self.model = self.load_lora_model(
self.model, self.vllm_config, self.device
try:
with DeviceMemoryProfiler() as m:
time_before_load = time.perf_counter()
model_loader = get_model_loader(self.load_config)
self.model = model_loader.load_model(
vllm_config=self.vllm_config, model_config=self.model_config
)
if hasattr(self, "drafter"):
logger.info_once("Loading drafter model...")
self.drafter.load_model(self.model)
if (
hasattr(self.drafter, "model")
and is_mixture_of_experts(self.drafter.model)
and self.parallel_config.enable_eplb
):
spec_config = self.vllm_config.speculative_config
assert spec_config is not None
assert spec_config.draft_model_config is not None
logger.info_once(
"EPLB is enabled for drafter model %s.",
spec_config.draft_model_config.model,
if self.lora_config:
self.model = self.load_lora_model(
self.model, self.vllm_config, self.device
)
if hasattr(self, "drafter"):
logger.info_once("Loading drafter model...")
self.drafter.load_model(self.model)
if (
hasattr(self.drafter, "model")
and is_mixture_of_experts(self.drafter.model)
and self.parallel_config.enable_eplb
):
spec_config = self.vllm_config.speculative_config
assert spec_config is not None
assert spec_config.draft_model_config is not None
logger.info_once(
"EPLB is enabled for drafter model %s.",
spec_config.draft_model_config.model,
)
global_expert_load = (
global_expert_loads[eplb_models]
if global_expert_loads
else None
)
old_global_expert_indices = (
old_global_expert_indices_per_model[eplb_models]
if old_global_expert_indices_per_model
else None
)
if self.eplb_state is None:
self.eplb_state = EplbState(self.parallel_config, self.device)
self.eplb_state.add_model(
self.drafter.model,
spec_config.draft_model_config,
global_expert_load,
old_global_expert_indices,
rank_mapping,
)
eplb_models += 1
global_expert_load = (
global_expert_loads[eplb_models]
if global_expert_loads
else None
)
old_global_expert_indices = (
old_global_expert_indices_per_model[eplb_models]
if old_global_expert_indices_per_model
else None
)
if self.eplb_state is None:
self.eplb_state = EplbState(
self.parallel_config, self.device
)
self.eplb_state.add_model(
self.drafter.model,
spec_config.draft_model_config,
global_expert_load,
old_global_expert_indices,
rank_mapping,
)
eplb_models += 1
if self.use_aux_hidden_state_outputs:
if not supports_eagle3(self.get_model()):
raise RuntimeError(
"Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested"
)
if self.use_aux_hidden_state_outputs:
if not supports_eagle3(self.get_model()):
raise RuntimeError(
"Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested"
)
# Try to get auxiliary layers from speculative config,
# otherwise use model's default layers
aux_layers = self._get_eagle3_aux_layers_from_config()
if aux_layers:
logger.info(
"Using auxiliary layers from speculative config: %s",
aux_layers,
)
else:
aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
# Try to get auxiliary layers from speculative config,
# otherwise use model's default layers
aux_layers = self._get_eagle3_aux_layers_from_config()
if aux_layers:
logger.info(
"Using auxiliary layers from speculative config: %s",
aux_layers,
)
else:
aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
self.model.set_aux_hidden_state_layers(aux_layers)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory
self.model.set_aux_hidden_state_layers(aux_layers)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory
except torch.cuda.OutOfMemoryError as e:
msg = (
"Failed to load model - not enough GPU memory. "
"Try lowering --gpu-memory-utilization to free memory for weights, "
"increasing --tensor-parallel-size, or using --quantization. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more tips."
)
combined_msg = f"{msg} (original error: {e})"
logger.error(combined_msg)
raise e
logger.info_once(
"Model loading took %.4f GiB memory and %.6f seconds",
self.model_memory_usage / GiB_bytes,
......@@ -3867,19 +3893,21 @@ class GPUModelRunner(
return {}
@contextmanager
def maybe_randomize_inputs(self, input_ids: torch.Tensor):
def maybe_randomize_inputs(
self, input_ids: torch.Tensor | None, inputs_embeds: torch.Tensor | None
):
"""
Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
This is to help balance expert-selection
- during profile_run
- during DP rank dummy run
"""
dp_size = self.vllm_config.parallel_config.data_parallel_size
randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
if not randomize_inputs:
yield
else:
import functools
elif input_ids is not None:
@functools.cache
def rand_input_ids() -> torch.Tensor:
......@@ -3887,13 +3915,27 @@ class GPUModelRunner(
self.input_ids.gpu,
low=0,
high=self.model_config.get_vocab_size(),
dtype=input_ids.dtype,
)
logger.debug_once("Randomizing dummy data for DP Rank")
logger.debug_once("Randomizing dummy input_ids for DP Rank")
input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True)
yield
input_ids.fill_(0)
else:
@functools.cache
def rand_inputs_embeds() -> torch.Tensor:
return torch.randn_like(
self.inputs_embeds.gpu,
)
assert inputs_embeds is not None
logger.debug_once("Randomizing dummy inputs_embeds for DP Rank")
inputs_embeds.copy_(
rand_inputs_embeds()[: inputs_embeds.size(0)], non_blocking=True
)
yield
inputs_embeds.fill_(0)
def _get_mm_dummy_batch(
self,
......@@ -4142,7 +4184,7 @@ class GPUModelRunner(
num_tokens_across_dp[:] = num_tokens_padded
with (
self.maybe_randomize_inputs(input_ids),
self.maybe_randomize_inputs(input_ids, inputs_embeds),
set_forward_context(
attn_metadata,
self.vllm_config,
......@@ -4425,31 +4467,8 @@ class GPUModelRunner(
dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch,
)
# NOTE: This happens when encoder cache needs to store
# the embeddings that encoder outputs are scattered onto.
# In this case we create dummy embeddings of size
# (max_tokens_for_modality, hidden_size) and scatter
# encoder output into it.
encoder_output_shape = dummy_encoder_outputs[0].shape
max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
dummy_modality
]
if encoder_output_shape[0] < max_mm_tokens_per_item:
encoder_hidden_size = encoder_output_shape[-1]
expanded_outputs = []
for output in dummy_encoder_outputs:
expanded = output.new_zeros(
(max_mm_tokens_per_item, encoder_hidden_size)
)
num_tokens = output.shape[0]
expanded[:num_tokens].copy_(output)
expanded_outputs.append(expanded)
dummy_encoder_outputs = expanded_outputs
# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
for i, output in enumerate(dummy_encoder_outputs):
self.encoder_cache[f"tmp_{i}"] = output
# Add `is_profile` here to pre-allocate communication buffers
hidden_states, last_hidden_states = self._dummy_run(
......@@ -4557,6 +4576,10 @@ class GPUModelRunner(
# after here.
set_cudagraph_capturing_enabled(False)
# Lock workspace to prevent resizing during execution.
# Max workspace sizes should have been captured during warmup/profiling.
lock_workspace()
end_time = time.perf_counter()
elapsed_time = end_time - start_time
cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
......@@ -4712,6 +4735,9 @@ class GPUModelRunner(
attention_backend_list, kv_cache_config.kv_cache_groups
)
# Check if attention backend supports PCP&DCP and related features.
check_attention_cp_compatibility(self.vllm_config)
for i, attn_backend_map in enumerate(attention_backend_maps):
self.attn_groups.append(create_attn_groups(attn_backend_map, i))
......@@ -4871,7 +4897,7 @@ class GPUModelRunner(
# we need to adjust the cudagraph sizes to be a multiple of the uniform
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
# Will be removed in the near future when we have seperate cudagraph capture
# Will be removed in the near future when we have separate cudagraph capture
# sizes for decode and mixed prefill-decode.
if (
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
......@@ -5370,20 +5396,6 @@ class GPUModelRunner(
kv_transfer_group.register_kv_caches(kv_caches)
kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
if self.dcp_world_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)
def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
"""
Add encoder-only layers to the KV cache config.
......
......@@ -54,6 +54,7 @@ from vllm.v1.outputs import (
from vllm.v1.utils import report_usage_stats
from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.worker_base import WorkerBase
from vllm.v1.worker.workspace import init_workspace_manager
logger = init_logger(__name__)
......@@ -81,7 +82,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.set_float32_matmul_precision(precision)
torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
......@@ -255,6 +256,10 @@ class Worker(WorkerBase):
else:
raise RuntimeError(f"Not support device type: {self.device_config.device}")
# Initialize workspace manager
num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1
init_workspace_manager(self.device, num_ubatches)
# Construct the model runner
if self.use_v2_model_runner:
from vllm.v1.worker.gpu.model_runner import (
......@@ -926,10 +931,11 @@ def init_worker_distributed_environment(
backend: str = "nccl",
) -> None:
"""Initialize the distributed environment."""
attention_config = vllm_config.attention_config
parallel_config = vllm_config.parallel_config
from vllm.model_executor.layers.batch_invariant import init_batch_invariance
init_batch_invariance()
init_batch_invariance(attention_config.backend)
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
init_method = distributed_init_method or "env://"
......
......@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group,
)
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
......@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
)
output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()
output.kv_connector_stats = (
KVConnectorModelRunnerMixin.get_kv_connector_stats()
)
kv_connector.clear_connector_metadata()
output.kv_connector_stats = kv_connector.get_kv_connector_stats()
output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events()
@staticmethod
def get_kv_connector_stats() -> KVConnectorStats | None:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None
kv_connector.clear_connector_metadata()
@staticmethod
def use_uniform_kv_cache(
......
......@@ -10,7 +10,7 @@ import torch
import torch.nn as nn
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import (
ensure_model_parallel_initialized,
init_distributed_environment,
......@@ -207,7 +207,8 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard.
self.model_runner.reset_dynamo_cache()
with set_current_vllm_config(self.vllm_config):
self.model_runner.reset_dynamo_cache()
# Get the maximum amount of memory used by the model weights and
# intermediate activations.
......
......@@ -4,10 +4,12 @@ from collections import defaultdict
from dataclasses import dataclass, field
import torch
from typing_extensions import deprecated
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config
......@@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
logger = init_logger(__name__)
class MultiModalBudget:
"""Helper class to calculate budget information for multi-modal models."""
......@@ -135,7 +139,7 @@ class AttentionGroup:
kv_cache_spec: KVCacheSpec
kv_cache_group_id: int
# When ubatching is enabled we will have a metadata builder for each ubatch
# so that if they use internal persistant buffers for cudagraphs, and they
# so that if they use internal persistent buffers for cudagraphs, and they
# won't have to worry about conflicting with the other ubatches.
metadata_builders: list[AttentionMetadataBuilder] = field(
default_factory=lambda: []
......@@ -198,6 +202,7 @@ def sanity_check_mm_encoder_outputs(
)
@deprecated("`scatter_mm_placeholders` is deprecated and will be removed in v0.15.0.")
def scatter_mm_placeholders(
embeds: torch.Tensor,
is_embed: torch.Tensor | None,
......@@ -226,6 +231,7 @@ def scatter_mm_placeholders(
return placeholders
@deprecated("`gather_mm_placeholders` is deprecated and will be removed in v0.15.0.")
def gather_mm_placeholders(
placeholders: torch.Tensor,
is_embed: torch.Tensor | None,
......@@ -313,8 +319,12 @@ def bind_kv_cache(
# TODO - analyze where runner_kv_caches is used and the right
# way to ensure it properly reflects multiple attention layers
# in the same decoder block.
if current_platform.is_cuda_alike() or current_platform.is_xpu():
# We know that the GPU runner is not impacted by this
if (
current_platform.is_cuda_alike()
or current_platform.is_xpu()
or current_platform.is_cpu()
):
# We know that the GPU / CPU runner is not impacted by this
# case. Some test code depends on runner_kv_caches, but
# not in a way that's impacted by ignoring this.
pass
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import os
from itertools import accumulate
from math import prod
from typing import Optional
import torch
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils.math_utils import round_up
from vllm.v1.worker.ubatching import dbo_current_ubatch_id
logger = init_logger(__name__)
def _compute_bytes(shape: tuple[int, ...], dtype: torch.dtype) -> int:
return prod(shape) * dtype.itemsize
# Byte-size constants (mebibyte / gibibyte); _MB is used to report workspace
# sizes in log and error messages.
_MB = 1024**2
_GiB = 1024**3
# Module-level WorkspaceManager singleton: None until init_workspace_manager()
# installs an instance, and set back to None by reset_workspace_manager().
_manager: Optional["WorkspaceManager"] = None
class WorkspaceManager:
"""Manager for workspace allocation.
Manages workspace buffers for DBO (Dual Batch Overlap) execution.
Can be locked to prevent further growth during execution.
"""
def __init__(self, device: torch.device, num_ubatches: int | None = None):
self._device = device
# Cache num ubatches at init based on configuration (default to 1)
self._num_ubatches = num_ubatches if num_ubatches is not None else 1
self._current_workspaces: list[torch.Tensor | None] = [None, None]
self._locked: bool = False
@staticmethod
def _workspace_size_bytes(workspace: torch.Tensor | None) -> int:
"""Get size of workspace in bytes."""
if workspace is None:
return 0
return workspace.numel() * workspace.element_size()
def lock(self) -> None:
"""Lock the workspace to prevent further growth.
After locking, any attempt to allocate a larger workspace will raise
an assertion error. This ensures workspace size is fixed during execution.
"""
self._locked = True
if envs.VLLM_DEBUG_WORKSPACE:
logger.info(
"[WORKSPACE DEBUG] Workspace locked. Current sizes: %s",
[
self._workspace_size_bytes(ws) / _MB
for ws in self._current_workspaces
if ws is not None
],
)
def is_locked(self) -> bool:
"""Check if workspace is locked."""
return self._locked
def get_simultaneous(
self, *shapes_and_dtypes: tuple[tuple[int, ...], torch.dtype]
) -> list[torch.Tensor]:
"""Get multiple workspace tensors simultaneously from a single allocation.
Args:
*shapes_and_dtypes: One or more (shape, dtype) tuples.
Returns:
List of tensor views into the workspace buffer, one per shape/dtype pair.
"""
actual_bytes = [_compute_bytes(s, d) for s, d in shapes_and_dtypes]
aligned_bytes = [round_up(actual, 256) for actual in actual_bytes]
total_bytes = sum(aligned_bytes)
# Calculate cumulative offsets using itertools.accumulate
offsets = list(accumulate([0] + aligned_bytes[:-1]))
current_workspace = self._ensure_workspace_size(total_bytes)
return [
current_workspace[offsets[i] : offsets[i] + actual_bytes[i]]
.view(shapes_and_dtypes[i][1])
.reshape(shapes_and_dtypes[i][0])
for i in range(len(shapes_and_dtypes))
]
def _ensure_workspace_size(self, required_bytes: int) -> torch.Tensor:
"""Ensure workspace is allocated and large enough, return current workspace.
Args:
required_bytes: The number of bytes required.
Returns:
The current workspace tensor.
"""
ubatch_id = dbo_current_ubatch_id()
current_workspace = self._current_workspaces[ubatch_id]
current_size = self._workspace_size_bytes(current_workspace)
if current_size < required_bytes:
def get_caller_info() -> str:
"""Find first frame outside WorkspaceManager."""
curr_frame = inspect.currentframe()
if curr_frame is None:
return "unknown"
# Walk up the stack skipping WorkspaceManager frames
curr_frame = curr_frame.f_back
while curr_frame is not None:
# TODO: This only catches instance methods (self), missing
# classmethods and staticmethods. Once Python 3.11+ is the
# minimum supported version, use co_qualname instead:
# qualname = curr_frame.f_code.co_qualname
# if qualname.startswith("WorkspaceManager."):
if isinstance(curr_frame.f_locals.get("self"), WorkspaceManager):
curr_frame = curr_frame.f_back
continue
filename = os.path.basename(curr_frame.f_code.co_filename)
return (
f"{filename}:{curr_frame.f_lineno}:{curr_frame.f_code.co_name}"
)
return "unknown"
if self._locked:
raise AssertionError(
f"Workspace is locked but allocation from '{get_caller_info()}' "
f"requires {required_bytes / _MB:.2f} MB, current size is "
f"{current_size / _MB:.2f} MB. "
"Workspace growth is not allowed after locking."
)
for ubatch_id in range(self._num_ubatches):
current_workspace = self._current_workspaces[ubatch_id]
if current_workspace is None:
self._current_workspaces[ubatch_id] = torch.empty(
(required_bytes,), dtype=torch.uint8, device=self._device
)
elif self._workspace_size_bytes(current_workspace) < required_bytes:
current_workspace.resize_(required_bytes)
if envs.VLLM_DEBUG_WORKSPACE:
logger.info(
"[WORKSPACE DEBUG] Resized workspace from '%s': %.2f MB -> "
"%.2f MB (%d ubatches, total memory %.2f MB)",
get_caller_info(),
current_size / _MB,
required_bytes / _MB,
self._num_ubatches,
required_bytes * self._num_ubatches / _MB,
)
current_workspace = self._current_workspaces[dbo_current_ubatch_id()]
return current_workspace
def is_workspace_manager_initialized() -> bool:
    """Report whether a workspace manager is currently installed.

    Returns:
        False while the module-level manager is None, True otherwise.
    """
    if _manager is None:
        return False
    return True
def current_workspace_manager() -> "WorkspaceManager":
    """Return the module-level workspace manager.

    Returns:
        The WorkspaceManager instance installed by init_workspace_manager().

    Raises:
        AssertionError: If no workspace manager has been installed.
    """
    manager = _manager
    assert manager is not None, (
        "WorkspaceManager not initialized. Call init_workspace_manager() "
        "with a device before using workspace functions."
    )
    return manager
def init_workspace_manager(
    device: torch.device, num_ubatches: int | None = None
) -> None:
    """Install a fresh module-level workspace manager for `device`.

    Must be called before any other workspace function is used. If a manager
    already exists, a warning naming the old and new devices is logged and
    the existing manager is replaced.

    Args:
        device: The device to allocate workspace on.
        num_ubatches: Number of micro-batches. Defaults to 1.
    """
    global _manager
    previous = _manager
    if previous is not None:
        logger.warning(
            "WorkspaceManager already initialized on device %s, "
            "reinitializing on device %s",
            previous._device,
            device,
        )
    _manager = WorkspaceManager(device, num_ubatches)
def lock_workspace() -> None:
    """Lock the current workspace manager against further growth.

    Once locked, any request that needs a workspace larger than the current
    size raises an AssertionError, so no workspace allocation can happen in
    the hot path after warmup/profiling.

    Example:
        # During initialization
        init_workspace_manager(device)

        # ... warmup/profiling grows the workspace to its maximum size ...

        # Lock once the maximum size has been reached
        lock_workspace()

        # From here on, every request made through
        # current_workspace_manager() must fit in the existing buffer.

    Raises:
        AssertionError: If the workspace manager has not been initialized.
    """
    manager = current_workspace_manager()
    manager.lock()
def reset_workspace_manager() -> None:
    """Drop the module-level workspace manager.

    Sets the module-level manager back to None (the uninitialized state).
    Mainly meant for tests that need to initialize the workspace manager
    again from a clean state.
    """
    global _manager
    _manager = None
    return None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment