"docs/vscode:/vscode.git/clone" did not exist on "e20eba753bbced43837aa92f747e6c50ee36ce09"
Unverified commit 6825d9a9, authored by Woosuk Kwon, committed by GitHub
Browse files

[BugFix][Spec Decode] Improve Prefix Caching Logic in Speculative Decoding (#18668)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent b554ab73
...@@ -174,6 +174,7 @@ class KVCacheManager: ...@@ -174,6 +174,7 @@ class KVCacheManager:
num_new_tokens: int, num_new_tokens: int,
num_new_computed_tokens: int = 0, num_new_computed_tokens: int = 0,
new_computed_blocks: Optional[KVCacheBlocks] = None, new_computed_blocks: Optional[KVCacheBlocks] = None,
num_draft_tokens: int = 0,
num_lookahead_tokens: int = 0, num_lookahead_tokens: int = 0,
delay_cache_blocks: bool = False, delay_cache_blocks: bool = False,
) -> Optional[KVCacheBlocks]: ) -> Optional[KVCacheBlocks]:
...@@ -273,7 +274,7 @@ class KVCacheManager: ...@@ -273,7 +274,7 @@ class KVCacheManager:
# generated (accepted) tokens. # generated (accepted) tokens.
self.single_type_manager.cache_blocks( self.single_type_manager.cache_blocks(
request, self.req_to_block_hashes[request.request_id], request, self.req_to_block_hashes[request.request_id],
num_computed_tokens + num_new_tokens - len(request.spec_token_ids)) num_computed_tokens + num_new_tokens - num_draft_tokens)
return KVCacheBlocks(new_blocks) return KVCacheBlocks(new_blocks)
......
...@@ -227,10 +227,15 @@ class Scheduler(SchedulerInterface): ...@@ -227,10 +227,15 @@ class Scheduler(SchedulerInterface):
req_index += 1 req_index += 1
continue continue
num_draft_tokens = max(
num_new_tokens + request.num_computed_tokens -
request.num_tokens, 0)
while True: while True:
new_blocks = self.kv_cache_manager.allocate_slots( new_blocks = self.kv_cache_manager.allocate_slots(
request, request,
num_new_tokens, num_new_tokens,
num_draft_tokens=num_draft_tokens,
num_lookahead_tokens=self.num_lookahead_tokens) num_lookahead_tokens=self.num_lookahead_tokens)
if new_blocks is None: if new_blocks is None:
# The request cannot be scheduled. # The request cannot be scheduled.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment