Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
5f12f0e7
Unverified
Commit
5f12f0e7
authored
Dec 01, 2024
by
Liangsheng Yin
Committed by
GitHub
Dec 01, 2024
Browse files
Fix chunked prefill when ignore eos (#2290)
parent
d5b95cbb
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
16 deletions
+19
-16
python/sglang/srt/managers/schedule_policy.py
python/sglang/srt/managers/schedule_policy.py
+7
-4
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+12
-12
No files found.
python/sglang/srt/managers/schedule_policy.py
View file @
5f12f0e7
...
...
@@ -142,7 +142,7 @@ class PrefillAdder:
self
.
req_states
=
None
self
.
can_run_list
=
[]
self
.
new_
inflight
_req
=
None
self
.
new_
being_chunked
_req
=
None
self
.
log_hit_tokens
=
0
self
.
log_input_tokens
=
0
...
...
@@ -182,7 +182,7 @@ class PrefillAdder:
self
.
log_hit_tokens
+=
prefix_len
self
.
log_input_tokens
+=
extend_input_len
def
add_
inflight
_req
(
self
,
req
:
Req
):
def
add_
being_chunked
_req
(
self
,
req
:
Req
):
truncated
=
req
.
extend_input_len
>
self
.
rem_chunk_tokens
req
.
extend_input_len
=
min
(
req
.
extend_input_len
,
self
.
rem_chunk_tokens
)
req
.
fill_ids
=
req
.
fill_ids
[:
len
(
req
.
prefix_indices
)
+
req
.
extend_input_len
]
...
...
@@ -269,10 +269,13 @@ class PrefillAdder:
else
:
# Chunked prefill
trunc_len
=
self
.
rem_chunk_tokens
if
trunc_len
==
0
:
return
AddReqResult
.
OTHER
req
.
extend_input_len
=
trunc_len
req
.
fill_ids
=
req
.
fill_ids
[:
trunc_len
]
self
.
can_run_list
.
append
(
req
)
self
.
new_
inflight
_req
=
req
self
.
new_
being_chunked
_req
=
req
self
.
_prefill_one_req
(
0
,
trunc_len
,
0
)
return
self
.
budget_state
()
...
...
@@ -326,7 +329,7 @@ class PrefillAdder:
req
.
extend_input_len
=
trunc_len
req
.
fill_ids
=
req
.
fill_ids
[:
len
(
req
.
prefix_indices
)
+
trunc_len
]
self
.
can_run_list
.
append
(
req
)
self
.
new_
inflight
_req
=
req
self
.
new_
being_chunked
_req
=
req
self
.
tree_cache
.
inc_lock_ref
(
req
.
last_node
)
self
.
_prefill_one_req
(
prefix_len
,
trunc_len
,
0
)
...
...
python/sglang/srt/managers/scheduler.py
View file @
5f12f0e7
...
...
@@ -660,7 +660,7 @@ class Scheduler:
self
.
waiting_queue
.
append
(
req
)
def
log_prefill_stats
(
self
,
adder
,
can_run_list
,
running_bs
,
has_
inflight
):
def
log_prefill_stats
(
self
,
adder
,
can_run_list
,
running_bs
,
has_
being_chunked
):
if
isinstance
(
self
.
tree_cache
,
RadixCache
):
self
.
tree_cache_metrics
[
"total"
]
+=
(
adder
.
log_input_tokens
+
adder
.
log_hit_tokens
...
...
@@ -684,14 +684,14 @@ class Scheduler:
f
"cache hit rate:
{
100.0
*
tree_cache_hit_rate
:.
2
f
}
%, "
f
"token usage:
{
num_used
/
self
.
max_total_num_tokens
:.
2
f
}
, "
f
"#running-req:
{
running_bs
}
, "
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
+
has_
inflight
}
"
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
+
has_
being_chunked
}
"
)
if
self
.
enable_metrics
:
self
.
stats
.
num_running_reqs
=
running_bs
self
.
stats
.
num_used_tokens
=
num_used
self
.
stats
.
token_usage
=
round
(
num_used
/
self
.
max_total_num_tokens
,
2
)
self
.
stats
.
num_queue_reqs
=
len
(
self
.
waiting_queue
)
+
has_
inflight
self
.
stats
.
num_queue_reqs
=
len
(
self
.
waiting_queue
)
+
has_
being_chunked
self
.
stats
.
cache_hit_rate
=
tree_cache_hit_rate
self
.
metrics_collector
.
log_stats
(
self
.
stats
)
...
...
@@ -752,7 +752,7 @@ class Scheduler:
# Move the chunked request out of the batch
self
.
last_batch
.
filter_batch
(
being_chunked_req
=
self
.
being_chunked_req
)
self
.
tree_cache
.
cache_unfinished_req
(
self
.
being_chunked_req
)
#
Inflight
request keeps its rid but will get a new req_pool_idx
#
being chunked
request keeps its rid but will get a new req_pool_idx
self
.
req_to_token_pool
.
free
(
self
.
being_chunked_req
.
req_pool_idx
)
self
.
batch_is_full
=
False
...
...
@@ -803,10 +803,10 @@ class Scheduler:
running_bs
if
self
.
is_mixed_chunk
else
0
,
)
has_
inflight
=
self
.
being_chunked_req
is
not
None
if
has_
inflight
:
has_
being_chunked
=
self
.
being_chunked_req
is
not
None
if
has_
being_chunked
:
self
.
being_chunked_req
.
init_next_round_input
()
self
.
being_chunked_req
=
adder
.
add_
inflight
_req
(
self
.
being_chunked_req
)
self
.
being_chunked_req
=
adder
.
add_
being_chunked
_req
(
self
.
being_chunked_req
)
if
self
.
lora_paths
:
lora_set
=
(
...
...
@@ -848,16 +848,16 @@ class Scheduler:
x
for
x
in
self
.
waiting_queue
if
x
not
in
set
(
can_run_list
)
]
if
adder
.
new_
inflight
_req
is
not
None
:
if
adder
.
new_
being_chunked
_req
is
not
None
:
assert
self
.
being_chunked_req
is
None
self
.
being_chunked_req
=
adder
.
new_
inflight
_req
self
.
being_chunked_req
=
adder
.
new_
being_chunked
_req
if
self
.
being_chunked_req
:
self
.
being_chunked_req
.
is_being_chunked
+=
1
# Print stats
if
self
.
tp_rank
==
0
:
self
.
log_prefill_stats
(
adder
,
can_run_list
,
running_bs
,
has_
inflight
)
self
.
log_prefill_stats
(
adder
,
can_run_list
,
running_bs
,
has_
being_chunked
)
# Create a new batch
new_batch
=
ScheduleBatch
.
init_new
(
...
...
@@ -1030,7 +1030,7 @@ class Scheduler:
if
req
.
grammar
is
not
None
:
req
.
grammar
.
accept_token
(
next_token_id
)
else
:
#
Inflight
reqs' prefill is not finished
#
being chunked
reqs' prefill is not finished
req
.
is_being_chunked
-=
1
if
batch
.
next_batch_sampling_info
:
...
...
@@ -1058,7 +1058,7 @@ class Scheduler:
else
:
self
.
tree_cache
.
cache_unfinished_req
(
req
)
else
:
#
Inflight
reqs' prefill is not finished
#
being chunked
reqs' prefill is not finished
req
.
is_being_chunked
-=
1
self
.
stream_output
(
batch
.
reqs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment