Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
deded17f
Unverified
Commit
deded17f
authored
Apr 21, 2025
by
Byron Hsu
Committed by
GitHub
Apr 21, 2025
Browse files
[PD] Fix edge case and simplify large page size + chunked prefill (#5589)
parent
f29a718f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
24 additions
and
21 deletions
+24
-21
python/sglang/srt/disaggregation/prefill.py
python/sglang/srt/disaggregation/prefill.py
+18
-7
python/sglang/srt/disaggregation/utils.py
python/sglang/srt/disaggregation/utils.py
+3
-11
scripts/playground/disaggregation/cli.py
scripts/playground/disaggregation/cli.py
+3
-3
No files found.
python/sglang/srt/disaggregation/prefill.py
View file @
deded17f
...
@@ -287,8 +287,16 @@ class SchedulerDisaggregationPrefillMixin:
...
@@ -287,8 +287,16 @@ class SchedulerDisaggregationPrefillMixin:
"""
"""
Send a prefilled chunk to the decode server
Send a prefilled chunk to the decode server
"""
"""
page_size
=
self
.
token_to_kv_pool_allocator
.
page_size
start_idx
=
req
.
start_send_idx
start_idx
=
req
.
start_send_idx
end_idx
=
min
(
len
(
req
.
fill_ids
),
len
(
req
.
origin_input_ids
))
end_idx
=
min
(
len
(
req
.
fill_ids
),
len
(
req
.
origin_input_ids
))
last_chunk
=
token_id
is
not
None
if
(
not
last_chunk
)
and
(
end_idx
%
page_size
!=
0
):
# todo: remove the second condition
# if not the last chunk and the last page is partial, delay the last partial page to the next send
end_idx
=
end_idx
-
end_idx
%
page_size
# Update next start_send_idx
# Update next start_send_idx
req
.
start_send_idx
=
end_idx
req
.
start_send_idx
=
end_idx
...
@@ -298,18 +306,21 @@ class SchedulerDisaggregationPrefillMixin:
...
@@ -298,18 +306,21 @@ class SchedulerDisaggregationPrefillMixin:
.
cpu
()
.
cpu
()
.
numpy
()
.
numpy
()
)
)
if
token_id
is
not
Non
e
:
if
last_chunk
is
Tru
e
:
self
.
disagg_prefill_pending_queue
.
store_prefill_results
(
self
.
disagg_prefill_pending_queue
.
store_prefill_results
(
req
.
metadata_buffer_index
,
token_id
req
.
metadata_buffer_index
,
token_id
)
)
is_last
=
token_id
is
not
None
page_indices
=
kv_to_page_indices
(
kv_indices
,
page_size
)
page_indices
=
kv_to_page_indices
(
kv_indices
,
self
.
token_to_kv_pool_allocator
.
page_size
)
page_start_idx
=
start_idx
//
self
.
token_to_kv_pool_allocator
.
page_size
page_start_idx
=
start_idx
//
page_size
page_end_idx
=
page_start_idx
+
len
(
page_indices
)
page_end_idx
=
page_start_idx
+
len
(
page_indices
)
if
len
(
page_indices
)
==
0
:
logger
.
info
(
f
"Skip sending kv chunk for request
{
req
.
rid
=
}
{
req
.
bootstrap_room
=
}
because page_indices is empty"
)
return
req
.
disagg_kv_sender
.
send
(
req
.
disagg_kv_sender
.
send
(
page_indices
,
slice
(
page_start_idx
,
page_end_idx
),
is_
last
page_indices
,
slice
(
page_start_idx
,
page_end_idx
),
last
_chunk
)
)
python/sglang/srt/disaggregation/utils.py
View file @
deded17f
...
@@ -76,22 +76,14 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
...
@@ -76,22 +76,14 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
raise
ValueError
(
f
"Unsupported transfer backend:
{
transfer_backend
}
"
)
raise
ValueError
(
f
"Unsupported transfer backend:
{
transfer_backend
}
"
)
def
kv_to_page_indices
(
kv_indices
:
np
.
ndarray
,
page_size
:
int
,
is_last
:
bool
=
True
):
def
kv_to_page_indices
(
kv_indices
:
np
.
ndarray
,
page_size
:
int
):
# 1. The page is guaruanteed to be full except the last page.
# 1. The page is guaruanteed to be full except the last page.
# 2. page index = kv_index // page_size
# 2. page index = kv_index // page_size
# The return vector is kv_indices[::page_size] // page_size
if
page_size
==
1
:
# shortcut
if
page_size
==
1
:
# shortcut
return
kv_indices
return
kv_indices
# if last chunk, send the last partial page
return
kv_indices
[::
page_size
]
//
page_size
# if not last chunk, delay the last partial page to the next send
if
is_last
:
return
kv_indices
[::
page_size
]
//
page_size
else
:
if
len
(
kv_indices
)
%
page_size
==
0
:
# no partial page
return
kv_indices
[::
page_size
]
//
page_size
else
:
# partial page
return
kv_indices
[::
page_size
][:
-
1
]
//
page_size
def
kv_to_page_num
(
num_kv_indices
:
int
,
page_size
:
int
):
def
kv_to_page_num
(
num_kv_indices
:
int
,
page_size
:
int
):
...
...
scripts/playground/disaggregation/cli.py
View file @
deded17f
prompt
=
"Hello "
*
16000
prompt
=
[
0
]
*
431
import
json
import
json
...
@@ -6,8 +6,8 @@ import requests
...
@@ -6,8 +6,8 @@ import requests
response
=
requests
.
post
(
response
=
requests
.
post
(
"http://0.0.0.0:8000/generate"
,
"http://0.0.0.0:8000/generate"
,
json
=
{
"
text
"
:
prompt
,
"sampling_params"
:
{
"temperature"
:
0
}},
json
=
{
"
input_ids
"
:
[
prompt
]
*
32
,
"sampling_params"
:
{
"temperature"
:
0
}},
)
)
print
(
"Response content (raw):"
,
response
.
content
)
#
print("Response content (raw):", response.content)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment