"vscode:/vscode.git/clone" did not exist on "bf97f933db4b8b8de1582bb29a0065ec2e80932f"
Unverified commit deded17f, authored by Byron Hsu and committed by GitHub
Browse files

[PD] Fix edge case and simplify large page size + chunked prefill (#5589)

parent f29a718f
...@@ -287,8 +287,16 @@ class SchedulerDisaggregationPrefillMixin: ...@@ -287,8 +287,16 @@ class SchedulerDisaggregationPrefillMixin:
""" """
Send a prefilled chunk to the decode server Send a prefilled chunk to the decode server
""" """
page_size = self.token_to_kv_pool_allocator.page_size
start_idx = req.start_send_idx start_idx = req.start_send_idx
end_idx = min(len(req.fill_ids), len(req.origin_input_ids)) end_idx = min(len(req.fill_ids), len(req.origin_input_ids))
last_chunk = token_id is not None
if (not last_chunk) and (
end_idx % page_size != 0
): # todo: remove the second condition
# if not the last chunk and the last page is partial, delay the last partial page to the next send
end_idx = end_idx - end_idx % page_size
# Update next start_send_idx # Update next start_send_idx
req.start_send_idx = end_idx req.start_send_idx = end_idx
...@@ -298,18 +306,21 @@ class SchedulerDisaggregationPrefillMixin: ...@@ -298,18 +306,21 @@ class SchedulerDisaggregationPrefillMixin:
.cpu() .cpu()
.numpy() .numpy()
) )
if token_id is not None: if last_chunk is True:
self.disagg_prefill_pending_queue.store_prefill_results( self.disagg_prefill_pending_queue.store_prefill_results(
req.metadata_buffer_index, token_id req.metadata_buffer_index, token_id
) )
is_last = token_id is not None page_indices = kv_to_page_indices(kv_indices, page_size)
page_indices = kv_to_page_indices(
kv_indices, self.token_to_kv_pool_allocator.page_size
)
page_start_idx = start_idx // self.token_to_kv_pool_allocator.page_size page_start_idx = start_idx // page_size
page_end_idx = page_start_idx + len(page_indices) page_end_idx = page_start_idx + len(page_indices)
if len(page_indices) == 0:
logger.info(
f"Skip sending kv chunk for request {req.rid=} {req.bootstrap_room=} because page_indices is empty"
)
return
req.disagg_kv_sender.send( req.disagg_kv_sender.send(
page_indices, slice(page_start_idx, page_end_idx), is_last page_indices, slice(page_start_idx, page_end_idx), last_chunk
) )
...@@ -76,22 +76,14 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType): ...@@ -76,22 +76,14 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
raise ValueError(f"Unsupported transfer backend: {transfer_backend}") raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
def kv_to_page_indices(kv_indices: np.ndarray, page_size: int, is_last: bool = True): def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
# 1. The page is guaranteed to be full except the last page. # 1. The page is guaranteed to be full except the last page.
# 2. page index = kv_index // page_size # 2. page index = kv_index // page_size
# The return vector is kv_indices[::page_size] // page_size
if page_size == 1: # shortcut if page_size == 1: # shortcut
return kv_indices return kv_indices
# if last chunk, send the last partial page
# if not last chunk, delay the last partial page to the next send
if is_last:
return kv_indices[::page_size] // page_size
else:
if len(kv_indices) % page_size == 0: # no partial page
return kv_indices[::page_size] // page_size return kv_indices[::page_size] // page_size
else: # partial page
return kv_indices[::page_size][:-1] // page_size
def kv_to_page_num(num_kv_indices: int, page_size: int): def kv_to_page_num(num_kv_indices: int, page_size: int):
......
prompt = "Hello " * 16000 prompt = [0] * 431
import json import json
...@@ -6,8 +6,8 @@ import requests ...@@ -6,8 +6,8 @@ import requests
response = requests.post( response = requests.post(
"http://0.0.0.0:8000/generate", "http://0.0.0.0:8000/generate",
json={"text": prompt, "sampling_params": {"temperature": 0}}, json={"input_ids": [prompt] * 32, "sampling_params": {"temperature": 0}},
) )
print("Response content (raw):", response.content) # print("Response content (raw):", response.content)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment