Unverified Commit f4417f84 authored by Martin Hickey's avatar Martin Hickey Committed by GitHub
Browse files

[KVConnector] Add KV events to KV Connectors (#28309)


Signed-off-by: default avatarMartin Hickey <martin.hickey@ie.ibm.com>
parent a11f4a81
This diff is collapsed.
......@@ -5,7 +5,7 @@ import queue
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from collections import Counter, deque
from collections.abc import Callable
from dataclasses import asdict
from itertools import count
......@@ -54,11 +54,26 @@ class BlockStored(KVCacheEvent):
lora_id: int | None
medium: str | None
def __hash__(self) -> int:
return hash(
(
tuple(self.block_hashes),
self.parent_block_hash,
tuple(self.token_ids),
self.block_size,
self.lora_id,
self.medium,
)
)
class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: str | None
def __hash__(self) -> int:
return hash((tuple(self.block_hashes), self.medium))
class AllBlocksCleared(KVCacheEvent):
pass
......@@ -68,6 +83,119 @@ class KVEventBatch(EventBatch):
events: list[BlockStored | BlockRemoved | AllBlocksCleared]
class KVEventAggregator:
"""
Aggregates KV events across multiple workers.
Tracks how many times each event appears and returns only those
that were emitted by all workers.
"""
__slots__ = ("_event_counter", "_num_workers")
def __init__(self, num_workers: int) -> None:
if num_workers <= 0:
raise ValueError("num_workers must be greater than zero.")
self._event_counter: Counter[KVCacheEvent] = Counter()
self._num_workers: int = num_workers
def add_events(self, events: list[KVCacheEvent]) -> None:
"""
Add events from a worker batch.
:param events: List of KVCacheEvent objects.
"""
if not isinstance(events, list):
raise TypeError("events must be a list of KVCacheEvent.")
self._event_counter.update(events)
def get_common_events(self) -> list[KVCacheEvent]:
"""
Return events that appeared in all workers.
:return: List of events present in all workers.
"""
return [
event
for event, count in self._event_counter.items()
if count == self._num_workers
]
def get_all_events(self) -> list[KVCacheEvent]:
"""
Return all events for all workers.
:return: List of events for all workers.
"""
return list(self._event_counter.elements())
def clear_events(self) -> None:
"""
Clear all tracked events.
"""
self._event_counter.clear()
def increment_workers(self, count: int = 1) -> None:
"""
Increment the number of workers contributing events.
:param count: Number to increment the workers by.
"""
if count <= 0:
raise ValueError("count must be positive.")
self._num_workers += count
def reset_workers(self) -> None:
"""
Reset the number of workers to 1.
"""
self._num_workers = 1
def get_number_of_workers(self) -> int:
"""
Return the number of workers.
:return: int number of workers.
"""
return self._num_workers
def __repr__(self) -> str:
return (
f"<KVEventAggregator workers={self._num_workers}, "
f"events={len(self._event_counter)}>"
)
class KVConnectorKVEvents(ABC):
"""
Abstract base class for KV events.
Acts as a container for KV events from the connector.
"""
@abstractmethod
def add_events(self, events: list[KVCacheEvent]) -> None:
raise NotImplementedError
@abstractmethod
def aggregate(self) -> "KVConnectorKVEvents":
raise NotImplementedError
@abstractmethod
def increment_workers(self, count: int = 1) -> None:
raise NotImplementedError
@abstractmethod
def get_all_events(self) -> list[KVCacheEvent]:
raise NotImplementedError
@abstractmethod
def get_number_of_workers(self) -> int:
raise NotImplementedError
@abstractmethod
def clear_events(self) -> None:
raise NotImplementedError
class EventPublisher(ABC):
"""Lightweight publisher for EventBatch batches with data parallelism
support.
......
......@@ -78,6 +78,7 @@ class KVOutputAggregator:
finished_sending = set[str]()
finished_recving = set[str]()
aggregated_kv_connector_stats = None
combined_kv_cache_events = None
invalid_block_ids = set[int]()
for model_runner_output in outputs:
assert model_runner_output is not None
......@@ -119,6 +120,19 @@ class KVOutputAggregator:
aggregated_kv_connector_stats.aggregate(kv_connector_stats)
)
# Combine kv_cache_events from all workers.
if combined_kv_cache_events is None:
# Use the first worker's kv_cache events as start event list.
combined_kv_cache_events = kv_output.kv_cache_events
elif kv_cache_events := kv_output.kv_cache_events:
assert isinstance(
combined_kv_cache_events,
type(kv_cache_events),
)
worker_kv_cache_events = kv_cache_events.get_all_events()
combined_kv_cache_events.add_events(worker_kv_cache_events)
combined_kv_cache_events.increment_workers(1)
invalid_block_ids |= kv_output.invalid_block_ids
# select output of the worker specified by output_rank
......@@ -129,6 +143,7 @@ class KVOutputAggregator:
finished_sending=finished_sending or None,
finished_recving=finished_recving or None,
kv_connector_stats=aggregated_kv_connector_stats or None,
kv_cache_events=combined_kv_cache_events or None,
invalid_block_ids=invalid_block_ids,
expected_finished_count=self._expected_finished_count,
)
......
......@@ -49,7 +49,7 @@ from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVCacheEvent
from vllm.distributed.kv_events import KVCacheEvent, KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorPromMetrics,
KVConnectorStats,
......@@ -379,6 +379,14 @@ class KVConnectorBase_V1(ABC):
"""
return None
def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]:
"""
Get the KV connector kv cache events collected during the last interval.
This function should be called by the model runner every time after the
model execution and before cleanup.
"""
return None
def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
"""
Get the KVConnector handshake metadata for this connector.
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any
import torch
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import VllmConfig
from vllm.distributed.kv_events import (
BlockStored,
KVCacheEvent,
KVConnectorKVEvents,
KVEventAggregator,
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1,
KVConnectorMetadata,
......@@ -16,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
)
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
......@@ -26,6 +31,44 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
class LMCacheKVEvents(KVConnectorKVEvents):
"""
Concrete implementation of KVConnectorKVEvents using KVEventAggregator.
"""
def __init__(self, num_workers: int) -> None:
self._aggregator = KVEventAggregator(num_workers)
def add_events(self, events: list[KVCacheEvent]) -> None:
self._aggregator.add_events(events)
def aggregate(self) -> "LMCacheKVEvents":
"""
Aggregate KV events and retain only common events.
"""
common_events = self._aggregator.get_common_events()
self._aggregator.clear_events()
self._aggregator.add_events(common_events)
self._aggregator.reset_workers()
return self
def increment_workers(self, count: int = 1) -> None:
self._aggregator.increment_workers(count)
def get_all_events(self) -> list[KVCacheEvent]:
return self._aggregator.get_all_events()
def get_number_of_workers(self) -> int:
return self._aggregator.get_number_of_workers()
def clear_events(self) -> None:
self._aggregator.clear_events()
self._aggregator.reset_workers()
def __repr__(self) -> str:
return f"<LMCacheKVEvents events={self.get_all_events()}>"
class LMCacheConnectorV1(KVConnectorBase_V1):
def __init__(
self,
......@@ -50,10 +93,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
cls = _adapter.LMCacheConnectorV1Impl
else:
logger.info("Initializing latest dev LMCache connector")
# lazy import
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)
cls = LMCacheConnectorLatestImpl
self._lmcache_engine = cls(vllm_config, role, self)
self._kv_cache_events: LMCacheKVEvents | None = None
# ==============================
# Worker-side methods
# ==============================
......@@ -151,6 +201,31 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
# Fallback for older versions that don't support this method
return set()
def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
"""
Get the KV connector kv cache events collected during the last interval.
"""
events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined]
if not events:
return None
blocks: list[BlockStored] = [
BlockStored(
block_hashes=e.block_hashes,
parent_block_hash=e.parent_block_hash,
token_ids=e.token_ids,
lora_id=e.lora_id,
block_size=e.block_size,
medium=e.medium,
)
for e in events
]
lmcache_kv_events = LMCacheKVEvents(num_workers=1)
lmcache_kv_events.add_events(blocks)
return lmcache_kv_events
# ==============================
# Scheduler-side methods
# ==============================
......@@ -198,6 +273,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
"""
return self._lmcache_engine.build_connector_meta(scheduler_output)
def update_connector_output(self, connector_output: KVConnectorOutput):
"""
Update KVConnector state from worker-side connectors output.
Args:
connector_output (KVConnectorOutput): the worker-side
connectors output.
"""
# Get the KV events
kv_cache_events = connector_output.kv_cache_events
if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents):
return
if self._kv_cache_events is None:
self._kv_cache_events = kv_cache_events
else:
self._kv_cache_events.add_events(kv_cache_events.get_all_events())
self._kv_cache_events.increment_workers(
kv_cache_events.get_number_of_workers()
)
return
def request_finished(
self,
request: "Request",
......@@ -214,3 +311,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
returned by the engine.
"""
return self._lmcache_engine.request_finished(request, block_ids)
def take_events(self) -> Iterable["KVCacheEvent"]:
"""
Take the KV cache events from the connector.
Yields:
New KV cache events since the last call.
"""
if self._kv_cache_events is not None:
self._kv_cache_events.aggregate()
kv_cache_events = self._kv_cache_events.get_all_events()
yield from kv_cache_events
self._kv_cache_events.clear_events()
self._kv_cache_events = None
......@@ -259,6 +259,12 @@ class MultiConnector(KVConnectorBase_V1):
agg_block_ids |= c.get_block_ids_with_load_errors()
return agg_block_ids
# TODO: Add a generic implementation of 'get_kv_connector_kv_cache_events' method
# for the MultiConnector. It should be able to get events from multiple
# connectors, handling the case where only a subset of the requested connectors
# implements the 'get_kv_connector_kv_cache_events'
# Follow on PR from https://github.com/vllm-project/vllm/pull/28309#pullrequestreview-3566351082
# ==============================
# Scheduler-side methods
# ==============================
......
......@@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
from vllm.distributed.kv_events import KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
else:
KVConnectorStats = object
KVConnectorKVEvents = object
class LogprobsLists(NamedTuple):
......@@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending: set[str] | None = None
finished_recving: set[str] | None = None
kv_connector_stats: KVConnectorStats | None = None
kv_cache_events: KVConnectorKVEvents | None = None
# IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids: set[int] = field(default_factory=set)
......@@ -123,6 +126,7 @@ class KVConnectorOutput:
not self.finished_sending
and not self.finished_recving
and not self.kv_connector_stats
and not self.kv_cache_events
and not self.invalid_block_ids
)
......
......@@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group,
)
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
......@@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
)
output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()
output.kv_connector_stats = (
KVConnectorModelRunnerMixin.get_kv_connector_stats()
)
kv_connector.clear_connector_metadata()
output.kv_connector_stats = kv_connector.get_kv_connector_stats()
output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events()
@staticmethod
def get_kv_connector_stats() -> KVConnectorStats | None:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None
kv_connector.clear_connector_metadata()
@staticmethod
def use_uniform_kv_cache(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment