Unverified commit 12001f2e, authored by maobaolong, committed by GitHub
Browse files

[LMCache] Pass TP size in lookup for MLA multi-reader locking (#36129)


Signed-off-by: baoloongmao <baoloongmao@tencent.com>
Co-authored-by: Yihua Cheng <yihua98@uchicago.edu>
parent 7ee5d509
...@@ -114,6 +114,7 @@ class LMCacheMPSchedulerAdapter: ...@@ -114,6 +114,7 @@ class LMCacheMPSchedulerAdapter:
world_size: int, world_size: int,
kv_rank: int, kv_rank: int,
vllm_block_size: int, vllm_block_size: int,
tp_size: int = 1,
): ):
""" """
Args: Args:
...@@ -124,6 +125,8 @@ class LMCacheMPSchedulerAdapter: ...@@ -124,6 +125,8 @@ class LMCacheMPSchedulerAdapter:
world_size: The world size used for LMCache keys world_size: The world size used for LMCache keys
kv_rank: The kv rank used for LMCache keys kv_rank: The kv rank used for LMCache keys
vllm_block_size: The block size used in vLLM vllm_block_size: The block size used in vLLM
tp_size: Tensor-parallel size for MLA
multi-reader locking (default 1).
""" """
self.mq_client = MessageQueueClient(server_url, context) self.mq_client = MessageQueueClient(server_url, context)
...@@ -133,6 +136,7 @@ class LMCacheMPSchedulerAdapter: ...@@ -133,6 +136,7 @@ class LMCacheMPSchedulerAdapter:
self.model_name = model_name self.model_name = model_name
self.world_size = world_size self.world_size = world_size
self.worker_id = kv_rank self.worker_id = kv_rank
self.tp_size = tp_size
# Read chunk size from lmcache # Read chunk size from lmcache
self.chunk_size = get_lmcache_chunk_size(self.mq_client) self.chunk_size = get_lmcache_chunk_size(self.mq_client)
...@@ -281,6 +285,7 @@ class LMCacheMPSchedulerAdapter: ...@@ -281,6 +285,7 @@ class LMCacheMPSchedulerAdapter:
start=start, start=start,
end=end, end=end,
request_id=request_id, request_id=request_id,
tp_size=self.tp_size,
) )
def _create_hash_key( def _create_hash_key(
...@@ -293,6 +298,7 @@ class LMCacheMPSchedulerAdapter: ...@@ -293,6 +298,7 @@ class LMCacheMPSchedulerAdapter:
worker_id=None, worker_id=None,
chunk_hash=chunk_hash, chunk_hash=chunk_hash,
request_id=request_id, request_id=request_id,
tp_size=self.tp_size,
) )
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
import inspect
from collections.abc import Iterable from collections.abc import Iterable
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Literal from typing import TYPE_CHECKING, Any, Literal
...@@ -52,6 +53,12 @@ if TYPE_CHECKING: ...@@ -52,6 +53,12 @@ if TYPE_CHECKING:
logger = lmcache_init_logger(__name__) logger = lmcache_init_logger(__name__)
def _adapter_accepts_tp_size() -> bool:
    """Report whether the imported adapter's constructor names ``tp_size``.

    Inspects the signature of ``LMCacheMPSchedulerAdapter.__init__`` and
    looks for a parameter literally called ``tp_size``.

    Returns:
        True if ``tp_size`` is one of the declared parameter names,
        False otherwise.
    """
    init_params = inspect.signature(
        LMCacheMPSchedulerAdapter.__init__
    ).parameters
    # Membership on the parameters mapping is a lookup by parameter name.
    return "tp_size" in init_params
# Helper functions # Helper functions
def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]: def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]:
if block_ids is None: if block_ids is None:
...@@ -101,6 +108,14 @@ def create_scheduler_adapter( ...@@ -101,6 +108,14 @@ def create_scheduler_adapter(
vllm_config.parallel_config.rank, vllm_config.parallel_config.rank,
vllm_config, vllm_config,
) )
tp_size = vllm_config.parallel_config.tensor_parallel_size
# Pass tp_size only when the adapter accepts it so that
# a newer vllm can still work with an older LMCache.
kwargs: dict[str, Any] = {}
if _adapter_accepts_tp_size():
kwargs["tp_size"] = tp_size
return LMCacheMPSchedulerAdapter( return LMCacheMPSchedulerAdapter(
server_url, server_url,
zmq_context, zmq_context,
...@@ -108,6 +123,7 @@ def create_scheduler_adapter( ...@@ -108,6 +123,7 @@ def create_scheduler_adapter(
world_size, world_size,
kv_rank, kv_rank,
vllm_config.cache_config.block_size, vllm_config.cache_config.block_size,
**kwargs,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment