Commit a3f8d5dd authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
...@@ -219,7 +219,7 @@ class Executor(ABC): ...@@ -219,7 +219,7 @@ class Executor(ABC):
def sample_tokens( def sample_tokens(
self, grammar_output: GrammarOutput | None, non_block: bool = False self, grammar_output: GrammarOutput | None, non_block: bool = False
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
output = self.collective_rpc( # type: ignore[call-overload] output = self.collective_rpc( # type: ignore[call-overload]
"sample_tokens", args=(grammar_output,), non_block=non_block "sample_tokens", args=(grammar_output,), non_block=non_block
) )
......
...@@ -124,9 +124,7 @@ class MultiprocExecutor(Executor): ...@@ -124,9 +124,7 @@ class MultiprocExecutor(Executor):
# Set multiprocessing envs # Set multiprocessing envs
set_multiprocessing_worker_envs() set_multiprocessing_worker_envs()
# Multiprocessing-based executor does not support multi-node setting. # use the loopback address get_loopback_ip() for communication.
# Since it only works for single node, we can use the loopback address
# get_loopback_ip() for communication.
distributed_init_method = get_distributed_init_method( distributed_init_method = get_distributed_init_method(
get_loopback_ip(), get_open_port() get_loopback_ip(), get_open_port()
) )
...@@ -294,8 +292,8 @@ class MultiprocExecutor(Executor): ...@@ -294,8 +292,8 @@ class MultiprocExecutor(Executor):
kwargs: dict | None = None, kwargs: dict | None = None,
non_block: bool = False, non_block: bool = False,
unique_reply_rank: int | None = None, unique_reply_rank: int | None = None,
kv_output_aggregator: KVOutputAggregator = None, kv_output_aggregator: KVOutputAggregator | None = None,
) -> Any | list[Any] | Future[Any | list[Any]]: ) -> Any:
"""Returns single result if unique_reply_rank and/or kv_output_aggregator """Returns single result if unique_reply_rank and/or kv_output_aggregator
is provided, otherwise list.""" is provided, otherwise list."""
assert self.rpc_broadcast_mq is not None, ( assert self.rpc_broadcast_mq is not None, (
...@@ -476,6 +474,8 @@ class WorkerProc: ...@@ -476,6 +474,8 @@ class WorkerProc:
"""Wrapper that runs one Worker in a separate process.""" """Wrapper that runs one Worker in a separate process."""
READY_STR = "READY" READY_STR = "READY"
rpc_broadcast_mq: MessageQueue | None
worker_response_mq: MessageQueue | None
def _init_message_queues( def _init_message_queues(
self, input_shm_handle: Handle, vllm_config: VllmConfig self, input_shm_handle: Handle, vllm_config: VllmConfig
...@@ -487,7 +487,7 @@ class WorkerProc: ...@@ -487,7 +487,7 @@ class WorkerProc:
) )
# Initializes a message queue for sending the model output # Initializes a message queue for sending the model output
self.worker_response_mq: MessageQueue = MessageQueue(1, 1) self.worker_response_mq = MessageQueue(1, 1)
self.peer_response_handles = [] self.peer_response_handles = []
else: else:
# Initialize remote MessageQueue for receiving SchedulerOutput across nodes # Initialize remote MessageQueue for receiving SchedulerOutput across nodes
...@@ -706,7 +706,7 @@ class WorkerProc: ...@@ -706,7 +706,7 @@ class WorkerProc:
death_pipe.recv() death_pipe.recv()
except EOFError: except EOFError:
# Parent process has exited, terminate this worker # Parent process has exited, terminate this worker
logger.info("Parent process exited, terminating worker") logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown # Send signal to self to trigger clean shutdown
shutdown_event.set() shutdown_event.set()
except Exception as e: except Exception as e:
...@@ -720,6 +720,7 @@ class WorkerProc: ...@@ -720,6 +720,7 @@ class WorkerProc:
try: try:
reader.close() reader.close()
worker = WorkerProc(*args, **kwargs) worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None
# Send READY once we know everything is loaded # Send READY once we know everything is loaded
ready_writer.send( ready_writer.send(
...@@ -804,6 +805,7 @@ class WorkerProc: ...@@ -804,6 +805,7 @@ class WorkerProc:
def worker_busy_loop(self, cancel: threading.Event | None = None): def worker_busy_loop(self, cancel: threading.Event | None = None):
"""Main busy loop for Multiprocessing Workers""" """Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True: while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue( method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True cancel=cancel, indefinite=True
......
...@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor): ...@@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor):
self, self,
grammar_output: "GrammarOutput | None", grammar_output: "GrammarOutput | None",
non_block: bool = False, non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]: ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
"""Execute the model on the Ray workers. """Execute the model on the Ray workers.
The scheduler output to use should have been provided in The scheduler output to use should have been provided in
...@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor): ...@@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor):
""" """
scheduler_output = self.scheduler_output scheduler_output = self.scheduler_output
if scheduler_output is None: if scheduler_output is None:
return COMPLETED_NONE_FUTURE if non_block else None # noqa return COMPLETED_NONE_FUTURE if non_block else None
self.scheduler_output = None self.scheduler_output = None
...@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor): ...@@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor):
scheduler_output: SchedulerOutput, scheduler_output: SchedulerOutput,
grammar_output: "GrammarOutput | None", grammar_output: "GrammarOutput | None",
non_block: bool = False, non_block: bool = False,
) -> ModelRunnerOutput | Future[ModelRunnerOutput]: ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
# Build the compiled DAG for the first time. # Build the compiled DAG for the first time.
if self.forward_dag is None: # type: ignore if self.forward_dag is None: # type: ignore
self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
......
...@@ -67,7 +67,7 @@ class UniProcExecutor(Executor): ...@@ -67,7 +67,7 @@ class UniProcExecutor(Executor):
kwargs: dict | None = None, kwargs: dict | None = None,
non_block: bool = False, non_block: bool = False,
single_value: bool = False, single_value: bool = False,
) -> Any | list[Any] | Future[Any | list[Any]]: ) -> Any:
if kwargs is None: if kwargs is None:
kwargs = {} kwargs = {}
...@@ -79,10 +79,13 @@ class UniProcExecutor(Executor): ...@@ -79,10 +79,13 @@ class UniProcExecutor(Executor):
result = run_method(self.driver_worker, method, args, kwargs) result = run_method(self.driver_worker, method, args, kwargs)
if isinstance(result, AsyncModelRunnerOutput): if isinstance(result, AsyncModelRunnerOutput):
if (async_thread := self.async_output_thread) is not None: if (async_thread := self.async_output_thread) is not None:
get_output = result.get_output if single_value:
if not single_value: return async_thread.submit(result.get_output)
get_output = lambda go=result.get_output: [go()]
return async_thread.submit(get_output) def get_output_list() -> list[Any]:
return [result.get_output()]
return async_thread.submit(get_output_list)
result = result.get_output() result = result.get_output()
future = Future[Any]() future = Future[Any]()
future.set_result(result if single_value else [result]) future.set_result(result if single_value else [result])
......
...@@ -13,7 +13,7 @@ from vllm.v1.kv_offload.backends.cpu import CPUBackend ...@@ -13,7 +13,7 @@ from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.spec import OffloadingSpec from vllm.v1.kv_offload.spec import OffloadingSpec
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandler from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
from vllm.v1.kv_offload.worker.worker import OffloadingHandler from vllm.v1.kv_offload.worker.worker import OffloadingHandler
...@@ -32,7 +32,7 @@ class CPUOffloadingSpec(OffloadingSpec): ...@@ -32,7 +32,7 @@ class CPUOffloadingSpec(OffloadingSpec):
self._manager: OffloadingManager | None = None self._manager: OffloadingManager | None = None
# worker-side # worker-side
self._handler: OffloadingHandler | None = None self._handlers: CpuGpuOffloadingHandlers | None = None
self.eviction_policy: str = self.extra_config.get("eviction_policy", "lru") self.eviction_policy: str = self.extra_config.get("eviction_policy", "lru")
...@@ -67,13 +67,13 @@ class CPUOffloadingSpec(OffloadingSpec): ...@@ -67,13 +67,13 @@ class CPUOffloadingSpec(OffloadingSpec):
kv_caches: dict[str, torch.Tensor], kv_caches: dict[str, torch.Tensor],
attn_backends: dict[str, type[AttentionBackend]], attn_backends: dict[str, type[AttentionBackend]],
) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]:
if not self._handler: if not self._handlers:
if not current_platform.is_cuda_alike(): if not current_platform.is_cuda_alike():
raise Exception( raise Exception(
"CPU Offloading is currently only supported on CUDA-alike GPUs" "CPU Offloading is currently only supported on CUDA-alike GPUs"
) )
self._handler = CpuGpuOffloadingHandler( self._handlers = CpuGpuOffloadingHandlers(
attn_backends=attn_backends, attn_backends=attn_backends,
gpu_block_size=self.gpu_block_size, gpu_block_size=self.gpu_block_size,
cpu_block_size=self.offloaded_block_size, cpu_block_size=self.offloaded_block_size,
...@@ -81,6 +81,6 @@ class CPUOffloadingSpec(OffloadingSpec): ...@@ -81,6 +81,6 @@ class CPUOffloadingSpec(OffloadingSpec):
gpu_caches=kv_caches, gpu_caches=kv_caches,
) )
assert self._handler is not None assert self._handlers is not None
yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler yield GPULoadStoreSpec, CPULoadStoreSpec, self._handlers.gpu_to_cpu_handler
yield CPULoadStoreSpec, GPULoadStoreSpec, self._handler yield CPULoadStoreSpec, GPULoadStoreSpec, self._handlers.cpu_to_gpu_handler
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections import deque
import numpy as np import numpy as np
import torch import torch
...@@ -8,7 +9,7 @@ from vllm import _custom_ops as ops ...@@ -8,7 +9,7 @@ from vllm import _custom_ops as ops
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec
from vllm.v1.kv_offload.worker.worker import ( from vllm.v1.kv_offload.worker.worker import (
OffloadingHandler, OffloadingHandler,
TransferResult, TransferResult,
...@@ -51,7 +52,123 @@ def expand_block_ids( ...@@ -51,7 +52,123 @@ def expand_block_ids(
output_idx = output_end_idx output_idx = output_end_idx
class CpuGpuOffloadingHandler(OffloadingHandler): class SingleDirectionOffloadingHandler(OffloadingHandler):
"""
SingleDirectionOffloadingHandler handles transfers for a single direction,
either CPU->GPU or GPU->CPU.
Transfers are guaranteed to be executed in order of their submission.
Each transfer uses a unique CUDA stream, and its stream will start
executing only after the streams of previous transfers have finished.
"""
def __init__(
    self,
    src_tensors: list[torch.Tensor],
    dst_tensors: list[torch.Tensor],
    kv_dim_before_num_blocks: list[bool],
    src_block_size_factor: int,
    dst_block_size_factor: int,
    priority: int,
):
    """
    Set up a handler that copies KV blocks in one fixed direction.

    Args:
        src_tensors: KV cache tensors that blocks are read from.
        dst_tensors: KV cache tensors that blocks are written to,
            listed in the same order as src_tensors.
        kv_dim_before_num_blocks: one flag per tensor pair; True when
            the tensor carries a KV dimension ahead of its num_blocks
            dimension, e.g. a (2, num_blocks, ...) layout.
        src_block_size_factor: how many kernel blocks make up one
            KV block on the source side.
        dst_block_size_factor: how many kernel blocks make up one
            KV block on the destination side.
        priority: priority given to the CUDA streams this handler
            creates. Lower numbers indicate higher priority.
    """
    # the three lists are iterated in lockstep, so they must line up
    num_tensors = len(src_tensors)
    assert num_tensors == len(dst_tensors) == len(kv_dim_before_num_blocks)

    self.priority = priority
    self.src_block_size_factor: int = src_block_size_factor
    self.dst_block_size_factor: int = dst_block_size_factor

    self.src_tensors: list[torch.Tensor] = src_tensors
    self.dst_tensors: list[torch.Tensor] = dst_tensors
    self.kv_dim_before_num_blocks: list[bool] = kv_dim_before_num_blocks

    # CUDA streams and events that are free to be re-used by later transfers
    self._stream_pool: list[torch.cuda.Stream] = []
    self._event_pool: list[torch.Event] = []

    # in-flight transfers as (job_id, stream, event), oldest first
    self._transfers: deque[tuple[int, torch.cuda.Stream, torch.Event]] = deque()
def transfer_async(self, job_id: int, transfer_spec: TransferSpec) -> bool:
    """
    Queue an asynchronous copy of the requested blocks.

    Args:
        job_id: identifier stored with the transfer in the queue.
        transfer_spec: a (source spec, destination spec) pair; both
            must be BlockIDsLoadStoreSpec instances whose block_ids
            arrays are one-dimensional.

    Returns:
        True once the copy has been enqueued.
    """
    src_spec, dst_spec = transfer_spec
    assert isinstance(src_spec, BlockIDsLoadStoreSpec)
    assert isinstance(dst_spec, BlockIDsLoadStoreSpec)

    src_blocks = src_spec.block_ids
    dst_blocks = dst_spec.block_ids
    assert src_blocks.ndim == 1
    assert dst_blocks.ndim == 1

    src_factor = self.src_block_size_factor
    dst_factor = self.dst_block_size_factor

    num_src_sub_blocks = src_blocks.size * src_factor
    num_dst_sub_blocks = dst_blocks.size * dst_factor
    # number of source sub-blocks handed to expand_block_ids as skip_count
    leading_skip = -dst_blocks.size % src_factor
    assert num_dst_sub_blocks == num_src_sub_blocks - leading_skip

    # column 0 holds source sub-block ids, column 1 destination sub-block ids
    mapping = np.empty((num_dst_sub_blocks, 2), dtype=np.int64)
    expand_block_ids(
        src_blocks,
        src_factor,
        mapping[:, 0],
        skip_count=leading_skip,
    )
    expand_block_ids(dst_blocks, dst_factor, mapping[:, 1])
    mapping_tensor = torch.from_numpy(mapping)

    # take a pooled stream / event when one is available, else create one
    if self._stream_pool:
        stream = self._stream_pool.pop()
    else:
        stream = torch.cuda.Stream(priority=self.priority)
    if self._event_pool:
        event = self._event_pool.pop()
    else:
        event = torch.Event()

    if self._transfers:
        # ensure this job starts only after the previous one completes
        previous_event = self._transfers[-1][2]
        stream.wait_event(previous_event)

    with torch.cuda.stream(stream):
        for src_tensor, dst_tensor, has_kv_dim in zip(
            self.src_tensors, self.dst_tensors, self.kv_dim_before_num_blocks
        ):
            if not has_kv_dim:
                ops.swap_blocks(src_tensor, dst_tensor, mapping_tensor)
                continue
            # leading dimension unpacks into the key and value caches
            src_key_cache, src_value_cache = src_tensor
            dst_key_cache, dst_value_cache = dst_tensor
            ops.swap_blocks(src_key_cache, dst_key_cache, mapping_tensor)
            ops.swap_blocks(src_value_cache, dst_value_cache, mapping_tensor)
        event.record(stream)

    self._transfers.append((job_id, stream, event))

    # success
    return True
def get_finished(self) -> list[TransferResult]:
    """
    Collect the transfers that have completed, in submission order.

    Entries are taken from the front of the transfer queue for as long
    as the head entry's event reports completion through query(); the
    scan stops at the first entry that is still pending. The stream and
    event of every collected transfer go back to the re-use pools.

    Returns:
        A list of (job_id, True) tuples, one per collected transfer.
    """
    finished: list[TransferResult] = []
    pending = self._transfers
    while pending:
        head_event = pending[0][2]
        if not head_event.query():
            # the oldest transfer is still running; stop scanning here
            break
        job_id, stream, event = pending.popleft()
        finished.append((job_id, True))
        self._stream_pool.append(stream)
        self._event_pool.append(event)
    return finished
class CpuGpuOffloadingHandlers:
def __init__( def __init__(
self, self,
gpu_block_size: int, gpu_block_size: int,
...@@ -60,27 +177,20 @@ class CpuGpuOffloadingHandler(OffloadingHandler): ...@@ -60,27 +177,20 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
gpu_caches: dict[str, torch.Tensor], gpu_caches: dict[str, torch.Tensor],
attn_backends: dict[str, type[AttentionBackend]], attn_backends: dict[str, type[AttentionBackend]],
): ):
assert gpu_caches
assert cpu_block_size % gpu_block_size == 0 assert cpu_block_size % gpu_block_size == 0
self.block_size_factor = cpu_block_size // gpu_block_size block_size_factor = cpu_block_size // gpu_block_size
# cuda streams for gpu->cpu and cpu->gpu
self.d2h_stream = torch.cuda.Stream()
self.h2d_stream = torch.cuda.Stream()
# job_id -> transfer cuda event
self.transfer_events: dict[int, torch.Event] = {}
# list of cuda events available for re-use
self.events_pool: list[torch.Event] = []
pin_memory = is_pin_memory_available() pin_memory = is_pin_memory_available()
# allocate cpu tensors # allocate cpu tensors
logger.info("Allocating %d CPU tensors...", len(gpu_caches)) logger.info("Allocating %d CPU tensors...", len(gpu_caches))
self.gpu_tensors: list[torch.Tensor] = [] gpu_tensors: list[torch.Tensor] = []
self.cpu_tensors: list[torch.Tensor] = [] cpu_tensors: list[torch.Tensor] = []
self.kv_dim_before_num_blocks: list[bool] = [] kv_dim_before_num_blocks: list[bool] = []
kernel_block_size: int | None = None
for layer_name, gpu_tensor in gpu_caches.items(): for layer_name, gpu_tensor in gpu_caches.items():
self.gpu_tensors.append(gpu_tensor) gpu_tensors.append(gpu_tensor)
gpu_shape = gpu_tensor.shape gpu_shape = gpu_tensor.shape
attn_backend = attn_backends[layer_name] attn_backend = attn_backends[layer_name]
...@@ -88,16 +198,21 @@ class CpuGpuOffloadingHandler(OffloadingHandler): ...@@ -88,16 +198,21 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256 num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256
) )
has_layers_dim = False
if len(gpu_shape) != len(test_shape): if len(gpu_shape) != len(test_shape):
# cross-layers tensor # cross-layers tensor
# shape is (num_blocks, ...) # shape is (num_blocks, ...)
assert len(gpu_shape) == len(test_shape) + 1 assert len(gpu_shape) == len(test_shape) + 1
num_blocks_idx = 0 num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False) has_layers_dim = True
kv_dim_before_num_blocks.append(False)
# prepend a dummy num_layers=80 to test_shape
test_shape = (80,) + test_shape
elif test_shape[0] == 1234: elif test_shape[0] == 1234:
# shape is (num_blocks, ...) # shape is (num_blocks, ...)
num_blocks_idx = 0 num_blocks_idx = 0
self.kv_dim_before_num_blocks.append(False) kv_dim_before_num_blocks.append(False)
else: else:
# shape should be (2, num_blocks, ...) # shape should be (2, num_blocks, ...)
assert test_shape[0] == 2 assert test_shape[0] == 2
...@@ -105,13 +220,32 @@ class CpuGpuOffloadingHandler(OffloadingHandler): ...@@ -105,13 +220,32 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
assert gpu_shape[0] == 2 assert gpu_shape[0] == 2
num_blocks_idx = 1 num_blocks_idx = 1
self.kv_dim_before_num_blocks.append(True) kv_dim_before_num_blocks.append(True)
try:
kv_cache_stride_order = attn_backend.get_kv_cache_stride_order(
include_num_layers_dimension=has_layers_dim
)
assert len(kv_cache_stride_order) == len(gpu_shape)
except (AttributeError, NotImplementedError):
kv_cache_stride_order = tuple(range(len(gpu_shape)))
# permute test_shape according to stride_order
test_shape = tuple(test_shape[i] for i in kv_cache_stride_order)
# find block_size (16) dimension index
block_size_idx = test_shape.index(16)
if kernel_block_size is not None:
assert kernel_block_size == gpu_shape[block_size_idx]
else:
kernel_block_size = gpu_shape[block_size_idx]
assert gpu_block_size % kernel_block_size == 0
cpu_shape = list(gpu_shape) cpu_shape = list(gpu_shape)
cpu_shape[num_blocks_idx] = num_cpu_blocks * self.block_size_factor cpu_shape[num_blocks_idx] = num_cpu_blocks * block_size_factor
logger.debug("Allocating CPU tensor of shape %r", cpu_shape) logger.debug("Allocating CPU tensor of shape %r", cpu_shape)
self.cpu_tensors.append( cpu_tensors.append(
torch.zeros( torch.zeros(
cpu_shape, cpu_shape,
dtype=gpu_tensor.dtype, dtype=gpu_tensor.dtype,
...@@ -120,72 +254,27 @@ class CpuGpuOffloadingHandler(OffloadingHandler): ...@@ -120,72 +254,27 @@ class CpuGpuOffloadingHandler(OffloadingHandler):
) )
) )
def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: assert kernel_block_size is not None
src_spec, dst_spec = spec gpu_block_size_factor = gpu_block_size // kernel_block_size
if isinstance(src_spec, CPULoadStoreSpec): cpu_block_size_factor = cpu_block_size // kernel_block_size
assert isinstance(dst_spec, GPULoadStoreSpec)
stream = self.h2d_stream
src_tensors = self.cpu_tensors
dst_tensors = self.gpu_tensors
src_block_size_factor = self.block_size_factor
dst_block_size_factor = 1
else:
assert isinstance(src_spec, GPULoadStoreSpec)
assert isinstance(dst_spec, CPULoadStoreSpec)
stream = self.d2h_stream
src_tensors = self.gpu_tensors
dst_tensors = self.cpu_tensors
src_block_size_factor = 1
dst_block_size_factor = self.block_size_factor
src_blocks = src_spec.block_ids
dst_blocks = dst_spec.block_ids
assert src_blocks.ndim == 1
assert dst_blocks.ndim == 1
src_sub_block_count = src_blocks.size * src_block_size_factor # TODO (orozery): adapt swap_blocks to support gpu_block_size_factor
dst_sub_block_count = dst_blocks.size * dst_block_size_factor assert gpu_block_size_factor == 1
src_sub_blocks_to_skip = -dst_blocks.size % src_block_size_factor
assert dst_sub_block_count == src_sub_block_count - src_sub_blocks_to_skip self.gpu_to_cpu_handler = SingleDirectionOffloadingHandler(
src_tensors=gpu_tensors,
src_to_dst = np.empty((dst_sub_block_count, 2), dtype=np.int64) dst_tensors=cpu_tensors,
expand_block_ids( kv_dim_before_num_blocks=kv_dim_before_num_blocks,
src_blocks, src_block_size_factor=gpu_block_size_factor,
src_block_size_factor, dst_block_size_factor=cpu_block_size_factor,
src_to_dst[:, 0], priority=1,
skip_count=src_sub_blocks_to_skip,
) )
expand_block_ids(dst_blocks, dst_block_size_factor, src_to_dst[:, 1])
src_to_dst_tensor = torch.from_numpy(src_to_dst)
event = self.events_pool.pop() if self.events_pool else torch.Event() self.cpu_to_gpu_handler = SingleDirectionOffloadingHandler(
with torch.cuda.stream(stream): src_tensors=cpu_tensors,
for src_tensor, dst_tensor, kv_dim in zip( dst_tensors=gpu_tensors,
src_tensors, dst_tensors, self.kv_dim_before_num_blocks kv_dim_before_num_blocks=kv_dim_before_num_blocks,
): src_block_size_factor=cpu_block_size_factor,
if kv_dim: dst_block_size_factor=gpu_block_size_factor,
src_key_cache = src_tensor[0] priority=-1,
dst_key_cache = dst_tensor[0] )
ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst_tensor)
src_value_cache = src_tensor[1]
dst_value_cache = dst_tensor[1]
ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst_tensor)
else:
ops.swap_blocks(src_tensor, dst_tensor, src_to_dst_tensor)
event.record(stream)
self.transfer_events[job_id] = event
# success
return True
def get_finished(self) -> list[TransferResult]:
results: list[TransferResult] = []
for job_id, event in self.transfer_events.items():
if event.query():
results.append((job_id, True))
self.events_pool.append(event)
for job_id, _ in results:
del self.transfer_events[job_id]
return results
...@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat ...@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.distributed.kv_events import KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
else: else:
KVConnectorStats = object KVConnectorStats = object
KVConnectorKVEvents = object
class LogprobsLists(NamedTuple): class LogprobsLists(NamedTuple):
...@@ -108,6 +110,7 @@ class KVConnectorOutput: ...@@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending: set[str] | None = None finished_sending: set[str] | None = None
finished_recving: set[str] | None = None finished_recving: set[str] | None = None
kv_connector_stats: KVConnectorStats | None = None kv_connector_stats: KVConnectorStats | None = None
kv_cache_events: KVConnectorKVEvents | None = None
# IDs of externally computed KV blocks that failed to load. # IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them # Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids: set[int] = field(default_factory=set) invalid_block_ids: set[int] = field(default_factory=set)
...@@ -123,6 +126,7 @@ class KVConnectorOutput: ...@@ -123,6 +126,7 @@ class KVConnectorOutput:
not self.finished_sending not self.finished_sending
and not self.finished_recving and not self.finished_recving
and not self.kv_connector_stats and not self.kv_connector_stats
and not self.kv_cache_events
and not self.invalid_block_ids and not self.invalid_block_ids
) )
......
...@@ -209,10 +209,10 @@ class Request: ...@@ -209,10 +209,10 @@ class Request:
def get_finished_reason(self) -> FinishReason | None: def get_finished_reason(self) -> FinishReason | None:
return RequestStatus.get_finished_reason(self.status) return RequestStatus.get_finished_reason(self.status)
def get_num_encoder_tokens(self, input_id: int) -> int: def get_num_encoder_embeds(self, input_id: int) -> int:
assert input_id < len(self.mm_features) assert input_id < len(self.mm_features)
num_tokens = self.mm_features[input_id].mm_position.length num_embeds = self.mm_features[input_id].mm_position.get_num_embeds
return num_tokens return num_embeds
def record_event( def record_event(
self, self,
...@@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum): ...@@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum):
FINISHED_LENGTH_CAPPED = enum.auto() FINISHED_LENGTH_CAPPED = enum.auto()
FINISHED_ABORTED = enum.auto() FINISHED_ABORTED = enum.auto()
FINISHED_IGNORED = enum.auto() FINISHED_IGNORED = enum.auto()
FINISHED_ERROR = enum.auto()
def __str__(self): def __str__(self):
return self.name return self.name
...@@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = { ...@@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = {
RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
RequestStatus.FINISHED_ERROR: FinishReason.ERROR,
} }
...@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module): ...@@ -145,7 +145,7 @@ class RejectionSampler(nn.Module):
) )
logprobs_tensors = None logprobs_tensors = None
if sampling_metadata.max_num_logprobs: if sampling_metadata.max_num_logprobs is not None:
logprobs_tensors = self._get_logprobs_tensors( logprobs_tensors = self._get_logprobs_tensors(
sampling_metadata.max_num_logprobs, sampling_metadata.max_num_logprobs,
metadata, metadata,
......
...@@ -170,7 +170,6 @@ class EagleProposer: ...@@ -170,7 +170,6 @@ class EagleProposer:
self.allowed_attn_types: tuple | None = None self.allowed_attn_types: tuple | None = None
if current_platform.is_rocm(): if current_platform.is_rocm():
rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
# ROCM_AITER_FA is an optional backend # ROCM_AITER_FA is an optional backend
# if find_spec( # if find_spec(
# AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False) # AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False)
...@@ -180,6 +179,12 @@ class EagleProposer: ...@@ -180,6 +179,12 @@ class EagleProposer:
# ) # )
# rocm_types.append(AiterFlashAttentionMetadata) # rocm_types.append(AiterFlashAttentionMetadata)
# TRITON_MLA backend support for MLA models (e.g., DeepSeek)
from vllm.v1.attention.backends.mla.common import MLACommonMetadata
rocm_types.append(MLACommonMetadata)
self.allowed_attn_types = tuple(rocm_types) self.allowed_attn_types = tuple(rocm_types)
# Parse the speculative token tree. # Parse the speculative token tree.
......
...@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING ...@@ -7,7 +7,7 @@ from typing import TYPE_CHECKING
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager from vllm.reasoning import ReasoningParserManager
from vllm.tokenizers import init_tokenizer_from_config from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.import_utils import LazyLoader from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import ( from vllm.v1.structured_output.backend_types import (
...@@ -71,7 +71,7 @@ class StructuredOutputManager: ...@@ -71,7 +71,7 @@ class StructuredOutputManager:
# of CPUs. # of CPUs.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers) self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.tokenizer = init_tokenizer_from_config( self.tokenizer = cached_tokenizer_from_config(
model_config=self.vllm_config.model_config model_config=self.vllm_config.model_config
) )
reasoning_parser = ( reasoning_parser = (
......
...@@ -10,7 +10,8 @@ import torch ...@@ -10,7 +10,8 @@ import torch
import vllm.envs import vllm.envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import ( from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend, StructuredOutputBackend,
...@@ -56,6 +57,27 @@ class XgrammarBackend(StructuredOutputBackend): ...@@ -56,6 +57,27 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
add_prefix_space=True, add_prefix_space=True,
) )
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else: else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface( tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer, self.tokenizer,
...@@ -246,13 +268,7 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool: ...@@ -246,13 +268,7 @@ def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
# Unsupported keywords for objects # Unsupported keywords for objects
if obj.get("type") == "object" and any( if obj.get("type") == "object" and any(
key in obj key in obj for key in ("patternProperties", "propertyNames")
for key in (
"minProperties",
"maxProperties",
"propertyNames",
"patternProperties",
)
): ):
return True return True
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, cast
from vllm.config import VllmConfig, get_layers_from_vllm_config
if TYPE_CHECKING:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
else:
AttentionLayerBase = object
def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
    """Assert that attention impls support the configured context parallelism.

    Reads the prefill (PCP) and decode (DCP) context-parallel sizes and the
    KV-cache interleave size from ``vllm_config.parallel_config``. When the
    product of the PCP and DCP sizes is greater than 1, every layer returned
    by ``get_layers_from_vllm_config`` that exposes an ``impl`` attribute is
    checked; layers without an ``impl`` are ignored.

    Args:
        vllm_config: The configuration whose parallel and speculative
            settings are validated against the attention layer impls.

    Raises:
        AssertionError: If an impl lacks one of the capability flags
            required by the active configuration.
    """
    parallel_cfg = vllm_config.parallel_config
    pcp_size = parallel_cfg.prefill_context_parallel_size
    dcp_size = parallel_cfg.decode_context_parallel_size
    interleave_size = parallel_cfg.cp_kv_cache_interleave_size

    # Nothing to validate unless the combined PCP * DCP size exceeds 1.
    if pcp_size * dcp_size <= 1:
        return

    attn_layers = get_layers_from_vllm_config(
        vllm_config, cast(type[Any], AttentionLayerBase)
    )
    for attn_layer in attn_layers.values():
        impl = getattr(attn_layer, "impl", None)
        if impl is not None:
            # Speculative decoding combined with a non-trivial interleave
            # size needs explicit support from the impl.
            if vllm_config.speculative_config is not None and interleave_size > 1:
                assert impl.supports_mtp_with_cp_non_trivial_interleave_size, (
                    "MTP with cp_kv_cache_interleave_size > 1 is not "
                    f"supported in {impl.__class__.__name__}."
                )

            # DCP needs the impl to return the softmax lse for decode.
            if dcp_size > 1:
                assert impl.need_to_return_lse_for_decode, (
                    "DCP requires attention impls to return"
                    " the softmax lse for decode, but the impl "
                    f"{impl.__class__.__name__} "
                    "does not return the softmax lse for decode."
                )

            # PCP is an opt-in capability advertised by the impl.
            if pcp_size > 1:
                assert impl.supports_pcp, (
                    "PCP requires attention impls' support, "
                    f"but the impl {impl.__class__.__name__} "
                    "does not support PCP."
                )
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import functools
import gc import gc
import itertools import itertools
import time import time
...@@ -148,6 +149,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer ...@@ -148,6 +149,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
...@@ -160,15 +162,14 @@ from vllm.v1.worker.ubatch_utils import ( ...@@ -160,15 +162,14 @@ from vllm.v1.worker.ubatch_utils import (
maybe_create_ubatch_slices, maybe_create_ubatch_slices,
) )
from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.workspace import lock_workspace
from .utils import ( from .utils import (
AttentionGroup, AttentionGroup,
MultiModalBudget, MultiModalBudget,
add_kv_sharing_layers_to_kv_cache_groups, add_kv_sharing_layers_to_kv_cache_groups,
bind_kv_cache, bind_kv_cache,
gather_mm_placeholders,
sanity_check_mm_encoder_outputs, sanity_check_mm_encoder_outputs,
scatter_mm_placeholders,
) )
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -295,6 +296,7 @@ class GPUModelRunner( ...@@ -295,6 +296,7 @@ class GPUModelRunner(
self.device = device self.device = device
self.pin_memory = is_pin_memory_available() self.pin_memory = is_pin_memory_available()
self.dtype = self.model_config.dtype self.dtype = self.model_config.dtype
self.kv_cache_dtype = kv_cache_dtype_str_to_dtype( self.kv_cache_dtype = kv_cache_dtype_str_to_dtype(
cache_config.cache_dtype, self.model_config cache_config.cache_dtype, self.model_config
) )
...@@ -1267,6 +1269,8 @@ class GPUModelRunner( ...@@ -1267,6 +1269,8 @@ class GPUModelRunner(
if not isinstance(kv_cache_spec, CrossAttentionSpec): if not isinstance(kv_cache_spec, CrossAttentionSpec):
return None, None return None, None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self.encoder_seq_lens.np[:num_reqs] = 0
# Build encoder_seq_lens array mapping request indices to # Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch # encoder lengths for inputs scheduled in this batch
for req_id in num_scheduled_tokens: for req_id in num_scheduled_tokens:
...@@ -1530,28 +1534,13 @@ class GPUModelRunner( ...@@ -1530,28 +1534,13 @@ class GPUModelRunner(
""" """
:return: tuple[attn_metadata, spec_decode_common_attn_metadata] :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
""" """
# Attention metadata is not needed for attention free models
if len(self.kv_cache_config.kv_cache_groups) == 0:
return {}, None
num_tokens_padded = num_tokens_padded or num_tokens num_tokens_padded = num_tokens_padded or num_tokens
num_reqs_padded = num_reqs_padded or num_reqs num_reqs_padded = num_reqs_padded or num_reqs
assert num_reqs_padded is not None and num_tokens_padded is not None
logits_indices_padded = None
num_logits_indices = None
if logits_indices is not None:
num_logits_indices = logits_indices.size(0)
if self.cache_config.kv_sharing_fast_prefill:
logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
logits_indices
)
# update seq_lens of decode reqs under DCP.
if self.dcp_world_size > 1:
self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
self.seq_lens.cpu[:num_reqs],
self.dcp_world_size,
self.dcp_rank,
self.parallel_config.cp_kv_cache_interleave_size,
)
self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0)
self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded)
attn_metadata: PerLayerAttnMetadata = {} attn_metadata: PerLayerAttnMetadata = {}
if ubatch_slices is not None: if ubatch_slices is not None:
...@@ -1572,36 +1561,12 @@ class GPUModelRunner( ...@@ -1572,36 +1561,12 @@ class GPUModelRunner(
self.num_accepted_tokens.np[num_reqs:].fill(1) self.num_accepted_tokens.np[num_reqs:].fill(1)
self.num_accepted_tokens.copy_to_gpu() self.num_accepted_tokens.copy_to_gpu()
# Used in the below loop, uses padded shapes kv_cache_groups = self.kv_cache_config.kv_cache_groups
query_start_loc = self.query_start_loc.gpu[: num_reqs_padded + 1]
query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs_padded + 1]
seq_lens = self.seq_lens.gpu[:num_reqs_padded]
seq_lens_cpu = self.seq_lens.cpu[:num_reqs_padded]
num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
:num_reqs_padded
]
dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None
if self.dcp_world_size > 1:
dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded]
dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs_padded]
spec_decode_common_attn_metadata = None
# Prepare the attention metadata for each KV cache group and make layers def _get_block_table_and_slot_mapping(kv_cache_gid: int):
# in the same group share the same metadata. assert num_reqs_padded is not None and num_tokens_padded is not None
for kv_cache_gid, kv_cache_group in enumerate( kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec
self.kv_cache_config.kv_cache_groups if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec):
):
encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens(
num_scheduled_tokens or {},
kv_cache_group.kv_cache_spec,
num_reqs_padded,
)
if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec):
# Encoder-only layers do not have KV cache, so we need to
# create a dummy block table and slot mapping for them.
blk_table_tensor = torch.zeros( blk_table_tensor = torch.zeros(
(num_reqs_padded, 1), (num_reqs_padded, 1),
dtype=torch.int32, dtype=torch.int32,
...@@ -1617,92 +1582,129 @@ class GPUModelRunner( ...@@ -1617,92 +1582,129 @@ class GPUModelRunner(
blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded)
slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded]
# Fill unused with -1. Needed for reshape_and_cache in full cuda # Fill unused with -1. Needed for reshape_and_cache in full cuda
# graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID
slot_mapping[num_tokens:num_tokens_padded].fill_(-1) slot_mapping[num_tokens:num_tokens_padded].fill_(-1)
blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1)
common_attn_metadata = CommonAttentionMetadata( return blk_table_tensor, slot_mapping
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu, block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
seq_lens=seq_lens, cm_base = CommonAttentionMetadata(
_seq_lens_cpu=seq_lens_cpu, query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
_num_computed_tokens_cpu=num_computed_tokens_cpu, query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
num_actual_tokens=num_tokens_padded, seq_lens=self.seq_lens.gpu[:num_reqs_padded],
num_reqs=num_reqs_padded, _seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded],
max_query_len=max_query_len, _num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[
max_seq_len=max_seq_len, :num_reqs_padded
block_table_tensor=blk_table_tensor, ],
slot_mapping=slot_mapping, num_reqs=num_reqs_padded,
logits_indices_padded=logits_indices_padded, num_actual_tokens=num_tokens_padded,
num_logits_indices=num_logits_indices, max_query_len=max_query_len,
causal=True, max_seq_len=max_seq_len,
encoder_seq_lens=encoder_seq_lens, block_table_tensor=block_table_gid_0,
encoder_seq_lens_cpu=encoder_seq_lens_cpu, slot_mapping=slot_mapping_gid_0,
dcp_local_seq_lens=dcp_local_seq_lens, causal=True,
dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, )
if self.dcp_world_size > 1:
self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens(
self.seq_lens.cpu[:num_reqs],
self.dcp_world_size,
self.dcp_rank,
self.parallel_config.cp_kv_cache_interleave_size,
)
self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0)
self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded)
cm_base.dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded]
cm_base.dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[
:num_reqs_padded
]
if logits_indices is not None and self.cache_config.kv_sharing_fast_prefill:
cm_base.num_logits_indices = logits_indices.size(0)
cm_base.logits_indices_padded = self._prepare_kv_sharing_fast_prefill(
logits_indices
)
def _build_attn_group_metadata(
kv_cache_gid: int,
attn_gid: int,
common_attn_metadata: CommonAttentionMetadata,
ubid: int | None = None,
) -> None:
attn_group = self.attn_groups[kv_cache_gid][attn_gid]
cascade_attn_prefix_len = (
cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
if cascade_attn_prefix_lens
else 0
)
builder = attn_group.get_metadata_builder(ubid or 0)
extra_attn_metadata_args = {}
if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
assert ubid is None, "UBatching not supported with GDN yet"
extra_attn_metadata_args = dict(
num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs_padded],
num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[
:num_reqs_padded
],
)
if for_cudagraph_capture:
attn_metadata_i = builder.build_for_cudagraph_capture(
common_attn_metadata
)
else:
attn_metadata_i = builder.build(
common_prefix_len=cascade_attn_prefix_len,
common_attn_metadata=common_attn_metadata,
**extra_attn_metadata_args,
)
if ubid is None:
assert isinstance(attn_metadata, dict)
attn_metadata_dict = attn_metadata
else:
assert isinstance(attn_metadata, list)
attn_metadata_dict = attn_metadata[ubid]
for layer_name in attn_group.layer_names:
attn_metadata_dict[layer_name] = attn_metadata_i
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
spec_decode_common_attn_metadata = None
for kv_cache_gid, kv_cache_group in enumerate(kv_cache_groups):
cm = copy(cm_base) # shallow copy
# Basically only the encoder seq_lens, block_table and slot_mapping change
# for each kv_cache_group.
cm.encoder_seq_lens, cm.encoder_seq_lens_cpu = self._get_encoder_seq_lens(
num_scheduled_tokens or {},
kv_cache_group.kv_cache_spec,
num_reqs_padded,
) )
if kv_cache_gid > 0:
cm.block_table_tensor, cm.slot_mapping = (
_get_block_table_and_slot_mapping(kv_cache_gid)
)
if self.speculative_config and spec_decode_common_attn_metadata is None: if self.speculative_config and spec_decode_common_attn_metadata is None:
if isinstance(self.drafter, EagleProposer): if isinstance(self.drafter, EagleProposer):
if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names: if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names:
spec_decode_common_attn_metadata = common_attn_metadata spec_decode_common_attn_metadata = cm
else: else:
spec_decode_common_attn_metadata = common_attn_metadata spec_decode_common_attn_metadata = cm
for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]):
cascade_attn_prefix_len = (
cascade_attn_prefix_lens[kv_cache_gid][attn_gid]
if cascade_attn_prefix_lens
else 0
)
builder = attn_group.get_metadata_builder()
extra_attn_metadata_args = {}
if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder):
extra_attn_metadata_args = dict(
num_accepted_tokens=self.num_accepted_tokens.gpu[
:num_reqs_padded
],
num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[
:num_reqs_padded
],
)
for attn_gid in range(len(self.attn_groups[kv_cache_gid])):
if ubatch_slices is not None: if ubatch_slices is not None:
common_attn_metadata_list = split_attn_metadata( for ubid, _cm in enumerate(split_attn_metadata(ubatch_slices, cm)):
ubatch_slices, common_attn_metadata _build_attn_group_metadata(kv_cache_gid, attn_gid, _cm, ubid)
)
for ubid, common_attn_metadata in enumerate(
common_attn_metadata_list
):
builder = attn_group.get_metadata_builder(ubatch_id=ubid)
if for_cudagraph_capture:
attn_metadata_i = builder.build_for_cudagraph_capture(
common_attn_metadata
)
else:
attn_metadata_i = builder.build(
common_prefix_len=cascade_attn_prefix_len,
common_attn_metadata=common_attn_metadata,
)
for layer_name in kv_cache_group.layer_names:
assert type(attn_metadata) is list
attn_metadata[ubid][layer_name] = attn_metadata_i
else: else:
assert isinstance(attn_metadata, dict) _build_attn_group_metadata(kv_cache_gid, attn_gid, cm)
if for_cudagraph_capture:
attn_metadata_i = builder.build_for_cudagraph_capture(
common_attn_metadata
)
else:
attn_metadata_i = builder.build(
common_prefix_len=cascade_attn_prefix_len,
common_attn_metadata=common_attn_metadata,
**extra_attn_metadata_args,
)
for layer_name in attn_group.layer_names:
attn_metadata[layer_name] = attn_metadata_i
if self.is_mm_prefix_lm: if self.is_mm_prefix_lm:
req_doc_ranges = {} req_doc_ranges = {}
...@@ -2183,10 +2185,7 @@ class GPUModelRunner( ...@@ -2183,10 +2185,7 @@ class GPUModelRunner(
# Cache the encoder outputs by mm_hash # Cache the encoder outputs by mm_hash
for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs): for (mm_hash, pos_info), output in zip(mm_hashes_pos, encoder_outputs):
self.encoder_cache[mm_hash] = scatter_mm_placeholders( self.encoder_cache[mm_hash] = output
output,
is_embed=pos_info.is_embed,
)
logger.debug("Finish execute for mm hash %s", mm_hash) logger.debug("Finish execute for mm hash %s", mm_hash)
self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash)
...@@ -2237,6 +2236,13 @@ class GPUModelRunner( ...@@ -2237,6 +2236,13 @@ class GPUModelRunner(
num_encoder_tokens, num_encoder_tokens,
) )
assert start_idx < end_idx assert start_idx < end_idx
curr_embeds_start, curr_embeds_end = (
pos_info.get_embeds_indices_in_range(start_idx, end_idx)
)
# If there are no embeddings in the current range, we skip
# gathering the embeddings.
if curr_embeds_start == curr_embeds_end:
continue
mm_hash = mm_feature.identifier mm_hash = mm_feature.identifier
encoder_output = self.encoder_cache.get(mm_hash, None) encoder_output = self.encoder_cache.get(mm_hash, None)
...@@ -2244,16 +2250,14 @@ class GPUModelRunner( ...@@ -2244,16 +2250,14 @@ class GPUModelRunner(
if (is_embed := pos_info.is_embed) is not None: if (is_embed := pos_info.is_embed) is not None:
is_embed = is_embed[start_idx:end_idx] is_embed = is_embed[start_idx:end_idx]
mm_embeds_item = encoder_output[curr_embeds_start:curr_embeds_end]
else:
mm_embeds_item = encoder_output[start_idx:end_idx]
req_start_pos = req_start_idx + start_pos - num_computed_tokens req_start_pos = req_start_idx + start_pos - num_computed_tokens
is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = ( is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = (
True if is_embed is None else is_embed True if is_embed is None else is_embed
) )
mm_embeds_item = gather_mm_placeholders(
encoder_output[start_idx:end_idx],
is_embed=is_embed,
)
mm_embeds_req.append(mm_embeds_item) mm_embeds_req.append(mm_embeds_item)
if self.is_multimodal_pruning_enabled and self.uses_mrope: if self.is_multimodal_pruning_enabled and self.uses_mrope:
...@@ -2764,6 +2768,7 @@ class GPUModelRunner( ...@@ -2764,6 +2768,7 @@ class GPUModelRunner(
# be improved in model runner v2) # be improved in model runner v2)
force_uniform_decode: bool | None = None, force_uniform_decode: bool | None = None,
force_has_lora: bool | None = None, force_has_lora: bool | None = None,
num_encoder_reqs: int = 0,
) -> tuple[ ) -> tuple[
CUDAGraphMode, CUDAGraphMode,
BatchDescriptor, BatchDescriptor,
...@@ -2780,6 +2785,11 @@ class GPUModelRunner( ...@@ -2780,6 +2785,11 @@ class GPUModelRunner(
if force_uniform_decode is None if force_uniform_decode is None
else force_uniform_decode else force_uniform_decode
) )
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batches are uniform.
has_encoder_output = (
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
)
has_lora = ( has_lora = (
len(self.input_batch.lora_id_to_lora_request) > 0 len(self.input_batch.lora_id_to_lora_request) > 0
...@@ -2799,7 +2809,7 @@ class GPUModelRunner( ...@@ -2799,7 +2809,7 @@ class GPUModelRunner(
) )
cudagraph_mode, batch_descriptor = dispatch_cudagraph( cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded, use_cascade_attn num_tokens_padded, use_cascade_attn or has_encoder_output
) )
num_tokens_padded = batch_descriptor.num_tokens num_tokens_padded = batch_descriptor.num_tokens
...@@ -2997,6 +3007,7 @@ class GPUModelRunner( ...@@ -2997,6 +3007,7 @@ class GPUModelRunner(
num_scheduled_tokens_np=num_scheduled_tokens_np, num_scheduled_tokens_np=num_scheduled_tokens_np,
max_num_scheduled_tokens=max_num_scheduled_tokens, max_num_scheduled_tokens=max_num_scheduled_tokens,
use_cascade_attn=cascade_attn_prefix_lens is not None, use_cascade_attn=cascade_attn_prefix_lens is not None,
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
) )
logger.debug( logger.debug(
...@@ -3562,74 +3573,89 @@ class GPUModelRunner( ...@@ -3562,74 +3573,89 @@ class GPUModelRunner(
if self.parallel_config.enable_eplb: if self.parallel_config.enable_eplb:
self.eplb_state = EplbState(self.parallel_config, self.device) self.eplb_state = EplbState(self.parallel_config, self.device)
eplb_models = 0 eplb_models = 0
with DeviceMemoryProfiler() as m:
time_before_load = time.perf_counter() try:
model_loader = get_model_loader(self.load_config) with DeviceMemoryProfiler() as m:
self.model = model_loader.load_model( time_before_load = time.perf_counter()
vllm_config=self.vllm_config, model_config=self.model_config model_loader = get_model_loader(self.load_config)
) self.model = model_loader.load_model(
if self.lora_config: vllm_config=self.vllm_config, model_config=self.model_config
self.model = self.load_lora_model(
self.model, self.vllm_config, self.device
) )
if hasattr(self, "drafter"): if self.lora_config:
logger.info_once("Loading drafter model...") self.model = self.load_lora_model(
self.drafter.load_model(self.model) self.model, self.vllm_config, self.device
if (
hasattr(self.drafter, "model")
and is_mixture_of_experts(self.drafter.model)
and self.parallel_config.enable_eplb
):
spec_config = self.vllm_config.speculative_config
assert spec_config is not None
assert spec_config.draft_model_config is not None
logger.info_once(
"EPLB is enabled for drafter model %s.",
spec_config.draft_model_config.model,
) )
if hasattr(self, "drafter"):
logger.info_once("Loading drafter model...")
self.drafter.load_model(self.model)
if (
hasattr(self.drafter, "model")
and is_mixture_of_experts(self.drafter.model)
and self.parallel_config.enable_eplb
):
spec_config = self.vllm_config.speculative_config
assert spec_config is not None
assert spec_config.draft_model_config is not None
logger.info_once(
"EPLB is enabled for drafter model %s.",
spec_config.draft_model_config.model,
)
global_expert_load = ( global_expert_load = (
global_expert_loads[eplb_models] global_expert_loads[eplb_models]
if global_expert_loads if global_expert_loads
else None else None
) )
old_global_expert_indices = ( old_global_expert_indices = (
old_global_expert_indices_per_model[eplb_models] old_global_expert_indices_per_model[eplb_models]
if old_global_expert_indices_per_model if old_global_expert_indices_per_model
else None else None
) )
if self.eplb_state is None: if self.eplb_state is None:
self.eplb_state = EplbState(self.parallel_config, self.device) self.eplb_state = EplbState(
self.eplb_state.add_model( self.parallel_config, self.device
self.drafter.model, )
spec_config.draft_model_config, self.eplb_state.add_model(
global_expert_load, self.drafter.model,
old_global_expert_indices, spec_config.draft_model_config,
rank_mapping, global_expert_load,
) old_global_expert_indices,
eplb_models += 1 rank_mapping,
)
eplb_models += 1
if self.use_aux_hidden_state_outputs: if self.use_aux_hidden_state_outputs:
if not supports_eagle3(self.get_model()): if not supports_eagle3(self.get_model()):
raise RuntimeError( raise RuntimeError(
"Model does not support EAGLE3 interface but " "Model does not support EAGLE3 interface but "
"aux_hidden_state_outputs was requested" "aux_hidden_state_outputs was requested"
) )
# Try to get auxiliary layers from speculative config, # Try to get auxiliary layers from speculative config,
# otherwise use model's default layers # otherwise use model's default layers
aux_layers = self._get_eagle3_aux_layers_from_config() aux_layers = self._get_eagle3_aux_layers_from_config()
if aux_layers: if aux_layers:
logger.info( logger.info(
"Using auxiliary layers from speculative config: %s", "Using auxiliary layers from speculative config: %s",
aux_layers, aux_layers,
) )
else: else:
aux_layers = self.model.get_eagle3_aux_hidden_state_layers() aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
self.model.set_aux_hidden_state_layers(aux_layers) self.model.set_aux_hidden_state_layers(aux_layers)
time_after_load = time.perf_counter() time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
except torch.cuda.OutOfMemoryError as e:
msg = (
"Failed to load model - not enough GPU memory. "
"Try lowering --gpu-memory-utilization to free memory for weights, "
"increasing --tensor-parallel-size, or using --quantization. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more tips."
)
combined_msg = f"{msg} (original error: {e})"
logger.error(combined_msg)
raise e
logger.info_once( logger.info_once(
"Model loading took %.4f GiB memory and %.6f seconds", "Model loading took %.4f GiB memory and %.6f seconds",
self.model_memory_usage / GiB_bytes, self.model_memory_usage / GiB_bytes,
...@@ -3867,19 +3893,21 @@ class GPUModelRunner( ...@@ -3867,19 +3893,21 @@ class GPUModelRunner(
return {} return {}
@contextmanager @contextmanager
def maybe_randomize_inputs(self, input_ids: torch.Tensor): def maybe_randomize_inputs(
self, input_ids: torch.Tensor | None, inputs_embeds: torch.Tensor | None
):
""" """
Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
This is to help balance expert-selection This is to help balance expert-selection
- during profile_run - during profile_run
- during DP rank dummy run - during DP rank dummy run
""" """
dp_size = self.vllm_config.parallel_config.data_parallel_size dp_size = self.vllm_config.parallel_config.data_parallel_size
randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
if not randomize_inputs: if not randomize_inputs:
yield yield
else: elif input_ids is not None:
import functools
@functools.cache @functools.cache
def rand_input_ids() -> torch.Tensor: def rand_input_ids() -> torch.Tensor:
...@@ -3887,13 +3915,27 @@ class GPUModelRunner( ...@@ -3887,13 +3915,27 @@ class GPUModelRunner(
self.input_ids.gpu, self.input_ids.gpu,
low=0, low=0,
high=self.model_config.get_vocab_size(), high=self.model_config.get_vocab_size(),
dtype=input_ids.dtype,
) )
logger.debug_once("Randomizing dummy data for DP Rank") logger.debug_once("Randomizing dummy input_ids for DP Rank")
input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True) input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True)
yield yield
input_ids.fill_(0) input_ids.fill_(0)
else:
@functools.cache
def rand_inputs_embeds() -> torch.Tensor:
return torch.randn_like(
self.inputs_embeds.gpu,
)
assert inputs_embeds is not None
logger.debug_once("Randomizing dummy inputs_embeds for DP Rank")
inputs_embeds.copy_(
rand_inputs_embeds()[: inputs_embeds.size(0)], non_blocking=True
)
yield
inputs_embeds.fill_(0)
def _get_mm_dummy_batch( def _get_mm_dummy_batch(
self, self,
...@@ -4142,7 +4184,7 @@ class GPUModelRunner( ...@@ -4142,7 +4184,7 @@ class GPUModelRunner(
num_tokens_across_dp[:] = num_tokens_padded num_tokens_across_dp[:] = num_tokens_padded
with ( with (
self.maybe_randomize_inputs(input_ids), self.maybe_randomize_inputs(input_ids, inputs_embeds),
set_forward_context( set_forward_context(
attn_metadata, attn_metadata,
self.vllm_config, self.vllm_config,
...@@ -4425,31 +4467,8 @@ class GPUModelRunner( ...@@ -4425,31 +4467,8 @@ class GPUModelRunner(
dummy_encoder_outputs, dummy_encoder_outputs,
expected_num_items=max_mm_items_per_batch, expected_num_items=max_mm_items_per_batch,
) )
for i, output in enumerate(dummy_encoder_outputs):
# NOTE: This happens when encoder cache needs to store self.encoder_cache[f"tmp_{i}"] = output
# the embeddings that encoder outputs are scattered onto.
# In this case we create dummy embeddings of size
# (max_tokens_for_modality, hidden_size) and scatter
# encoder output into it.
encoder_output_shape = dummy_encoder_outputs[0].shape
max_mm_tokens_per_item = mm_budget.max_tokens_by_modality[
dummy_modality
]
if encoder_output_shape[0] < max_mm_tokens_per_item:
encoder_hidden_size = encoder_output_shape[-1]
expanded_outputs = []
for output in dummy_encoder_outputs:
expanded = output.new_zeros(
(max_mm_tokens_per_item, encoder_hidden_size)
)
num_tokens = output.shape[0]
expanded[:num_tokens].copy_(output)
expanded_outputs.append(expanded)
dummy_encoder_outputs = expanded_outputs
# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
# Add `is_profile` here to pre-allocate communication buffers # Add `is_profile` here to pre-allocate communication buffers
hidden_states, last_hidden_states = self._dummy_run( hidden_states, last_hidden_states = self._dummy_run(
...@@ -4557,6 +4576,10 @@ class GPUModelRunner( ...@@ -4557,6 +4576,10 @@ class GPUModelRunner(
# after here. # after here.
set_cudagraph_capturing_enabled(False) set_cudagraph_capturing_enabled(False)
# Lock workspace to prevent resizing during execution.
# Max workspace sizes should have been captured during warmup/profiling.
lock_workspace()
end_time = time.perf_counter() end_time = time.perf_counter()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
...@@ -4712,6 +4735,9 @@ class GPUModelRunner( ...@@ -4712,6 +4735,9 @@ class GPUModelRunner(
attention_backend_list, kv_cache_config.kv_cache_groups attention_backend_list, kv_cache_config.kv_cache_groups
) )
# Check if attention backend supports PCP&DCP and related features.
check_attention_cp_compatibility(self.vllm_config)
for i, attn_backend_map in enumerate(attention_backend_maps): for i, attn_backend_map in enumerate(attention_backend_maps):
self.attn_groups.append(create_attn_groups(attn_backend_map, i)) self.attn_groups.append(create_attn_groups(attn_backend_map, i))
...@@ -4871,7 +4897,7 @@ class GPUModelRunner( ...@@ -4871,7 +4897,7 @@ class GPUModelRunner(
# we need to adjust the cudagraph sizes to be a multiple of the uniform # we need to adjust the cudagraph sizes to be a multiple of the uniform
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207 # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536 # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
# Will be removed in the near future when we have seperate cudagraph capture # Will be removed in the near future when we have separate cudagraph capture
# sizes for decode and mixed prefill-decode. # sizes for decode and mixed prefill-decode.
if ( if (
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
...@@ -5370,20 +5396,6 @@ class GPUModelRunner( ...@@ -5370,20 +5396,6 @@ class GPUModelRunner(
kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.register_kv_caches(kv_caches)
kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
if self.dcp_world_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)
def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
""" """
Add encoder-only layers to the KV cache config. Add encoder-only layers to the KV cache config.
......
...@@ -54,6 +54,7 @@ from vllm.v1.outputs import ( ...@@ -54,6 +54,7 @@ from vllm.v1.outputs import (
from vllm.v1.utils import report_usage_stats from vllm.v1.utils import report_usage_stats
from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.worker_base import WorkerBase from vllm.v1.worker.worker_base import WorkerBase
from vllm.v1.worker.workspace import init_workspace_manager
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -81,7 +82,7 @@ class Worker(WorkerBase): ...@@ -81,7 +82,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env. # configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.set_float32_matmul_precision(precision) torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
...@@ -255,6 +256,10 @@ class Worker(WorkerBase): ...@@ -255,6 +256,10 @@ class Worker(WorkerBase):
else: else:
raise RuntimeError(f"Not support device type: {self.device_config.device}") raise RuntimeError(f"Not support device type: {self.device_config.device}")
# Initialize workspace manager
num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1
init_workspace_manager(self.device, num_ubatches)
# Construct the model runner # Construct the model runner
if self.use_v2_model_runner: if self.use_v2_model_runner:
from vllm.v1.worker.gpu.model_runner import ( from vllm.v1.worker.gpu.model_runner import (
...@@ -926,10 +931,11 @@ def init_worker_distributed_environment( ...@@ -926,10 +931,11 @@ def init_worker_distributed_environment(
backend: str = "nccl", backend: str = "nccl",
) -> None: ) -> None:
"""Initialize the distributed environment.""" """Initialize the distributed environment."""
attention_config = vllm_config.attention_config
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
from vllm.model_executor.layers.batch_invariant import init_batch_invariance from vllm.model_executor.layers.batch_invariant import init_batch_invariance
init_batch_invariance() init_batch_invariance(attention_config.backend)
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
init_method = distributed_init_method or "env://" init_method = distributed_init_method or "env://"
......
...@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import ( ...@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group, has_kv_transfer_group,
) )
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.forward_context import get_forward_context, set_forward_context from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
...@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin: ...@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
) )
output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors() output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()
output.kv_connector_stats = ( output.kv_connector_stats = kv_connector.get_kv_connector_stats()
KVConnectorModelRunnerMixin.get_kv_connector_stats() output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events()
)
kv_connector.clear_connector_metadata()
@staticmethod kv_connector.clear_connector_metadata()
def get_kv_connector_stats() -> KVConnectorStats | None:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None
@staticmethod @staticmethod
def use_uniform_kv_cache( def use_uniform_kv_cache(
......
...@@ -10,7 +10,7 @@ import torch ...@@ -10,7 +10,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import ( from vllm.distributed import (
ensure_model_parallel_initialized, ensure_model_parallel_initialized,
init_distributed_environment, init_distributed_environment,
...@@ -207,7 +207,8 @@ class TPUWorker: ...@@ -207,7 +207,8 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per # one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to # compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard. # skip dynamo guard.
self.model_runner.reset_dynamo_cache() with set_current_vllm_config(self.vllm_config):
self.model_runner.reset_dynamo_cache()
# Get the maximum amount of memory used by the model weights and # Get the maximum amount of memory used by the model weights and
# intermediate activations. # intermediate activations.
......
...@@ -4,10 +4,12 @@ from collections import defaultdict ...@@ -4,10 +4,12 @@ from collections import defaultdict
from dataclasses import dataclass, field from dataclasses import dataclass, field
import torch import torch
from typing_extensions import deprecated
from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.layer import Attention from vllm.attention.layer import Attention
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.utils import extract_layer_index from vllm.model_executor.models.utils import extract_layer_index
from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.cache import processor_only_cache_from_config
...@@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder ...@@ -17,6 +19,8 @@ from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
logger = init_logger(__name__)
class MultiModalBudget: class MultiModalBudget:
"""Helper class to calculate budget information for multi-modal models.""" """Helper class to calculate budget information for multi-modal models."""
...@@ -135,7 +139,7 @@ class AttentionGroup: ...@@ -135,7 +139,7 @@ class AttentionGroup:
kv_cache_spec: KVCacheSpec kv_cache_spec: KVCacheSpec
kv_cache_group_id: int kv_cache_group_id: int
# When ubatching is enabled we will have a metadata builder for each ubatch # When ubatching is enabled we will have a metadata builder for each ubatch
# so that if they use internal persistant buffers for cudagraphs, and they # so that if they use internal persistent buffers for cudagraphs, and they
# won't have to worry about conflicting with the other ubatches. # won't have to worry about conflicting with the other ubatches.
metadata_builders: list[AttentionMetadataBuilder] = field( metadata_builders: list[AttentionMetadataBuilder] = field(
default_factory=lambda: [] default_factory=lambda: []
...@@ -198,6 +202,7 @@ def sanity_check_mm_encoder_outputs( ...@@ -198,6 +202,7 @@ def sanity_check_mm_encoder_outputs(
) )
@deprecated("`scatter_mm_placeholders` is deprecated and will be removed in v0.15.0.")
def scatter_mm_placeholders( def scatter_mm_placeholders(
embeds: torch.Tensor, embeds: torch.Tensor,
is_embed: torch.Tensor | None, is_embed: torch.Tensor | None,
...@@ -226,6 +231,7 @@ def scatter_mm_placeholders( ...@@ -226,6 +231,7 @@ def scatter_mm_placeholders(
return placeholders return placeholders
@deprecated("`gather_mm_placeholders` is deprecated and will be removed in v0.15.0.")
def gather_mm_placeholders( def gather_mm_placeholders(
placeholders: torch.Tensor, placeholders: torch.Tensor,
is_embed: torch.Tensor | None, is_embed: torch.Tensor | None,
...@@ -313,8 +319,12 @@ def bind_kv_cache( ...@@ -313,8 +319,12 @@ def bind_kv_cache(
# TODO - analyze where runner_kv_caches is used and the right # TODO - analyze where runner_kv_caches is used and the right
# way to ensure it properly reflects multiple attention layers # way to ensure it properly reflects multiple attention layers
# in the same decoder block. # in the same decoder block.
if current_platform.is_cuda_alike() or current_platform.is_xpu(): if (
# We know that the GPU runner is not impacted by this current_platform.is_cuda_alike()
or current_platform.is_xpu()
or current_platform.is_cpu()
):
# We know that the GPU / CPU runner is not impacted by this
# case. Some test code depends on runner_kv_caches, but # case. Some test code depends on runner_kv_caches, but
# not in a way that's impacted by ignoring this. # not in a way that's impacted by ignoring this.
pass pass
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import os
from itertools import accumulate
from math import prod
from typing import Optional
import torch
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.utils.math_utils import round_up
from vllm.v1.worker.ubatching import dbo_current_ubatch_id
logger = init_logger(__name__)
def _compute_bytes(shape: tuple[int, ...], dtype: torch.dtype) -> int:
return prod(shape) * dtype.itemsize
# Byte-size units (binary multiples)
_MB = 1 << 20
_GiB = 1 << 30

# Module-wide workspace manager; None until init_workspace_manager() installs
# one, and None again after reset_workspace_manager()
_manager: Optional["WorkspaceManager"] = None
class WorkspaceManager:
"""Manager for workspace allocation.
Manages workspace buffers for DBO (Dual Batch Overlap) execution.
Can be locked to prevent further growth during execution.
"""
def __init__(self, device: torch.device, num_ubatches: int | None = None):
self._device = device
# Cache num ubatches at init based on configuration (default to 1)
self._num_ubatches = num_ubatches if num_ubatches is not None else 1
self._current_workspaces: list[torch.Tensor | None] = [None, None]
self._locked: bool = False
@staticmethod
def _workspace_size_bytes(workspace: torch.Tensor | None) -> int:
"""Get size of workspace in bytes."""
if workspace is None:
return 0
return workspace.numel() * workspace.element_size()
def lock(self) -> None:
"""Lock the workspace to prevent further growth.
After locking, any attempt to allocate a larger workspace will raise
an assertion error. This ensures workspace size is fixed during execution.
"""
self._locked = True
if envs.VLLM_DEBUG_WORKSPACE:
logger.info(
"[WORKSPACE DEBUG] Workspace locked. Current sizes: %s",
[
self._workspace_size_bytes(ws) / _MB
for ws in self._current_workspaces
if ws is not None
],
)
def is_locked(self) -> bool:
"""Check if workspace is locked."""
return self._locked
def get_simultaneous(
self, *shapes_and_dtypes: tuple[tuple[int, ...], torch.dtype]
) -> list[torch.Tensor]:
"""Get multiple workspace tensors simultaneously from a single allocation.
Args:
*shapes_and_dtypes: One or more (shape, dtype) tuples.
Returns:
List of tensor views into the workspace buffer, one per shape/dtype pair.
"""
actual_bytes = [_compute_bytes(s, d) for s, d in shapes_and_dtypes]
aligned_bytes = [round_up(actual, 256) for actual in actual_bytes]
total_bytes = sum(aligned_bytes)
# Calculate cumulative offsets using itertools.accumulate
offsets = list(accumulate([0] + aligned_bytes[:-1]))
current_workspace = self._ensure_workspace_size(total_bytes)
return [
current_workspace[offsets[i] : offsets[i] + actual_bytes[i]]
.view(shapes_and_dtypes[i][1])
.reshape(shapes_and_dtypes[i][0])
for i in range(len(shapes_and_dtypes))
]
def _ensure_workspace_size(self, required_bytes: int) -> torch.Tensor:
"""Ensure workspace is allocated and large enough, return current workspace.
Args:
required_bytes: The number of bytes required.
Returns:
The current workspace tensor.
"""
ubatch_id = dbo_current_ubatch_id()
current_workspace = self._current_workspaces[ubatch_id]
current_size = self._workspace_size_bytes(current_workspace)
if current_size < required_bytes:
def get_caller_info() -> str:
"""Find first frame outside WorkspaceManager."""
curr_frame = inspect.currentframe()
if curr_frame is None:
return "unknown"
# Walk up the stack skipping WorkspaceManager frames
curr_frame = curr_frame.f_back
while curr_frame is not None:
# TODO: This only catches instance methods (self), missing
# classmethods and staticmethods. Once Python 3.11+ is the
# minimum supported version, use co_qualname instead:
# qualname = curr_frame.f_code.co_qualname
# if qualname.startswith("WorkspaceManager."):
if isinstance(curr_frame.f_locals.get("self"), WorkspaceManager):
curr_frame = curr_frame.f_back
continue
filename = os.path.basename(curr_frame.f_code.co_filename)
return (
f"{filename}:{curr_frame.f_lineno}:{curr_frame.f_code.co_name}"
)
return "unknown"
if self._locked:
raise AssertionError(
f"Workspace is locked but allocation from '{get_caller_info()}' "
f"requires {required_bytes / _MB:.2f} MB, current size is "
f"{current_size / _MB:.2f} MB. "
"Workspace growth is not allowed after locking."
)
for ubatch_id in range(self._num_ubatches):
current_workspace = self._current_workspaces[ubatch_id]
if current_workspace is None:
self._current_workspaces[ubatch_id] = torch.empty(
(required_bytes,), dtype=torch.uint8, device=self._device
)
elif self._workspace_size_bytes(current_workspace) < required_bytes:
current_workspace.resize_(required_bytes)
if envs.VLLM_DEBUG_WORKSPACE:
logger.info(
"[WORKSPACE DEBUG] Resized workspace from '%s': %.2f MB -> "
"%.2f MB (%d ubatches, total memory %.2f MB)",
get_caller_info(),
current_size / _MB,
required_bytes / _MB,
self._num_ubatches,
required_bytes * self._num_ubatches / _MB,
)
current_workspace = self._current_workspaces[dbo_current_ubatch_id()]
return current_workspace
def is_workspace_manager_initialized() -> bool:
    """Report whether a module-level workspace manager is currently set.

    Returns:
        True if a manager is installed, False otherwise.
    """
    initialized = _manager is not None
    return initialized
def current_workspace_manager() -> "WorkspaceManager":
    """Return the module-level workspace manager.

    Raises:
        AssertionError: If no workspace manager has been initialized.
    """
    manager = _manager
    assert manager is not None, (
        "WorkspaceManager not initialized. Call init_workspace_manager() "
        "with a device before using workspace functions."
    )
    return manager
def init_workspace_manager(
    device: torch.device, num_ubatches: int | None = None
) -> None:
    """Install a fresh module-level workspace manager for `device`.

    Must be called before any other workspace function is used. If a
    manager already exists, a warning is logged and it is replaced.

    Args:
        device: The device to allocate workspace on.
        num_ubatches: Number of micro-batches. Defaults to 1.
    """
    global _manager
    previous = _manager
    if previous is not None:
        logger.warning(
            "WorkspaceManager already initialized on device %s, "
            "reinitializing on device %s",
            previous._device,
            device,
        )
    _manager = WorkspaceManager(device, num_ubatches)
def lock_workspace() -> None:
    """Lock the current workspace manager so its buffers can no longer grow.

    Once locked, a request for more bytes than the workspace currently holds
    raises an AssertionError. This keeps the workspace size fixed during
    execution and prevents unexpected memory allocations in the hot path.

    Example:
        # During initialization
        init_workspace_manager(device)

        # Warmup/profiling: requests made here size the workspace
        current_workspace_manager().get_simultaneous((shape1, dtype1))

        # Lock after warmup/profiling
        lock_workspace()

        # From now on every request must fit in the pre-allocated size

    Raises:
        AssertionError: If no workspace manager has been initialized.
    """
    manager = current_workspace_manager()
    manager.lock()
def reset_workspace_manager() -> None:
    """Drop the module-level workspace manager, returning to the uninitialized state.

    Primarily intended for tests, so that each test can initialize the
    workspace manager from a clean slate.
    """
    global _manager
    _manager = None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment