Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c55550cb
Unverified
Commit
c55550cb
authored
Apr 25, 2025
by
Liangsheng Yin
Committed by
GitHub
Apr 25, 2025
Browse files
[PD] Better logs (#5715)
parent
43fb95c2
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
50 additions
and
34 deletions
+50
-34
python/sglang/srt/disaggregation/decode.py
python/sglang/srt/disaggregation/decode.py
+12
-5
python/sglang/srt/disaggregation/prefill.py
python/sglang/srt/disaggregation/prefill.py
+8
-9
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+30
-20
No files found.
python/sglang/srt/disaggregation/decode.py
View file @
c55550cb
...
...
@@ -307,7 +307,7 @@ class DecodeTransferQueue:
def
extend
(
self
,
req_conns
)
->
None
:
self
.
queue
.
extend
(
req_conns
)
def
pop_transferred
(
self
)
->
List
[
Req
]:
def
pop_transferred
(
self
)
->
List
[
DecodeRequest
]:
if
not
self
.
queue
:
return
[]
...
...
@@ -330,7 +330,7 @@ class DecodeTransferQueue:
assert
len
(
decode_req
.
req
.
output_ids
)
==
0
assert
decode_req
.
req
.
transferred_output_id
is
None
decode_req
.
req
.
transferred_output_id
=
output_id
transferred_reqs
.
append
(
decode_req
.
req
)
transferred_reqs
.
append
(
decode_req
)
indices_to_remove
.
add
(
i
)
elif
poll
in
[
KVPoll
.
Bootstrapping
,
...
...
@@ -454,7 +454,7 @@ class SchedulerDisaggregationDecodeMixin:
return
batch
,
result
@
torch
.
no_grad
()
def
event_loop_normal_disagg_decode
(
self
):
def
event_loop_normal_disagg_decode
(
self
:
Scheduler
):
"""A normal scheduler loop for decode worker in disaggregation mode."""
while
True
:
...
...
@@ -497,7 +497,7 @@ class SchedulerDisaggregationDecodeMixin:
self
.
last_batch
=
batch
@
torch
.
no_grad
()
def
event_loop_overlap_disagg_decode
(
self
):
def
event_loop_overlap_disagg_decode
(
self
:
Scheduler
):
result_queue
=
deque
()
self
.
last_batch
:
Optional
[
ScheduleBatch
]
=
None
self
.
last_batch_in_queue
=
False
# last batch is modified in place, so we need another variable to track if it's extend
...
...
@@ -641,8 +641,15 @@ class SchedulerDisaggregationDecodeMixin:
def
process_decode_queue
(
self
:
Scheduler
):
req_conns
=
self
.
disagg_decode_prealloc_queue
.
pop_preallocated
()
def
_num_pre_alloc
(
req
):
return
len
(
req
.
req
.
origin_input_ids
)
+
max
(
len
(
req
.
req
.
output_ids
)
-
1
,
0
)
self
.
num_tokens_pre_allocated
+=
sum
(
_num_pre_alloc
(
req
)
for
req
in
req_conns
)
self
.
disagg_decode_transfer_queue
.
extend
(
req_conns
)
alloc_reqs
=
(
self
.
disagg_decode_transfer_queue
.
pop_transferred
()
)
# the requests whose KV has arrived
self
.
waiting_queue
.
extend
(
alloc_reqs
)
self
.
num_tokens_pre_allocated
-=
sum
(
_num_pre_alloc
(
req
)
for
req
in
alloc_reqs
)
self
.
waiting_queue
.
extend
([
req
.
req
for
req
in
alloc_reqs
])
python/sglang/srt/disaggregation/prefill.py
View file @
c55550cb
...
...
@@ -176,14 +176,14 @@ class SchedulerDisaggregationPrefillMixin:
"""
@
torch
.
no_grad
()
def
event_loop_normal_disagg_prefill
(
self
):
def
event_loop_normal_disagg_prefill
(
self
:
Scheduler
):
"""A normal scheduler loop for prefill worker in disaggregation mode."""
while
True
:
recv_reqs
=
self
.
recv_requests
()
self
.
process_input_requests
(
recv_reqs
)
self
.
waiting_queue
.
extend
(
self
.
disagg_prefill_
pending
_queue
.
pop_bootstrapped
()
self
.
disagg_prefill_
bootstrap
_queue
.
pop_bootstrapped
()
)
self
.
process_prefill_chunk
()
batch
=
self
.
get_new_batch_prefill
()
...
...
@@ -214,14 +214,14 @@ class SchedulerDisaggregationPrefillMixin:
self
.
running_batch
.
batch_is_full
=
False
@
torch
.
no_grad
()
def
event_loop_overlap_disagg_prefill
(
self
):
def
event_loop_overlap_disagg_prefill
(
self
:
Scheduler
):
self
.
result_queue
=
deque
()
while
True
:
recv_reqs
=
self
.
recv_requests
()
self
.
process_input_requests
(
recv_reqs
)
self
.
waiting_queue
.
extend
(
self
.
disagg_prefill_
pending
_queue
.
pop_bootstrapped
()
self
.
disagg_prefill_
bootstrap
_queue
.
pop_bootstrapped
()
)
self
.
process_prefill_chunk
()
batch
=
self
.
get_new_batch_prefill
()
...
...
@@ -326,7 +326,7 @@ class SchedulerDisaggregationPrefillMixin:
raise
Exception
(
"Transferring failed"
)
for
req
in
done_reqs
:
self
.
disagg_prefill_
pending
_queue
.
req_to_metadata_buffer_idx_allocator
.
free
(
self
.
disagg_prefill_
bootstrap
_queue
.
req_to_metadata_buffer_idx_allocator
.
free
(
req
.
metadata_buffer_index
)
...
...
@@ -342,9 +342,8 @@ class SchedulerDisaggregationPrefillMixin:
# only finished requests to running_batch.
self
.
last_batch
.
filter_batch
(
chunked_req_to_exclude
=
self
.
chunked_req
)
self
.
tree_cache
.
cache_unfinished_req
(
self
.
chunked_req
)
if
(
self
.
enable_overlap
):
# Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
if
self
.
enable_overlap
:
# Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
self
.
chunked_req
.
tmp_end_idx
=
min
(
len
(
self
.
chunked_req
.
fill_ids
),
len
(
self
.
chunked_req
.
origin_input_ids
),
...
...
@@ -390,7 +389,7 @@ class SchedulerDisaggregationPrefillMixin:
.
numpy
()
)
if
last_chunk
is
True
:
self
.
disagg_prefill_
pending
_queue
.
store_prefill_results
(
self
.
disagg_prefill_
bootstrap
_queue
.
store_prefill_results
(
req
.
metadata_buffer_index
,
token_id
)
page_indices
=
kv_to_page_indices
(
kv_indices
,
page_size
)
...
...
python/sglang/srt/managers/scheduler.py
View file @
c55550cb
...
...
@@ -578,6 +578,10 @@ class Scheduler(
bootstrap_port
=
self
.
server_args
.
disaggregation_bootstrap_port
,
transfer_backend
=
self
.
transfer_backend
,
)
# Metric for pre-allocation
self
.
num_tokens_pre_allocated
=
0
elif
self
.
disaggregation_mode
==
DisaggregationMode
.
PREFILL
:
# *2 for the headroom.
buffer_size
=
self
.
max_running_requests
*
2
...
...
@@ -593,7 +597,7 @@ class Scheduler(
)
metadata_buffers
=
[
output_id_buffer
]
self
.
disagg_prefill_
pending
_queue
=
PrefillBootstrapQueue
(
self
.
disagg_prefill_
bootstrap
_queue
=
PrefillBootstrapQueue
(
token_to_kv_pool
=
self
.
token_to_kv_pool_allocator
.
get_kvcache
(),
req_to_metadata_buffer_idx_allocator
=
req_to_metadata_buffer_idx_allocator
,
metadata_buffers
=
metadata_buffers
,
...
...
@@ -901,7 +905,7 @@ class Scheduler(
def
_add_request_to_queue
(
self
,
req
:
Req
):
req
.
queue_time_start
=
time
.
time
()
if
self
.
disaggregation_mode
==
DisaggregationMode
.
PREFILL
:
self
.
disagg_prefill_
pending
_queue
.
add
(
req
)
self
.
disagg_prefill_
bootstrap
_queue
.
add
(
req
)
elif
self
.
disaggregation_mode
==
DisaggregationMode
.
DECODE
:
self
.
disagg_decode_prealloc_queue
.
add
(
req
)
else
:
...
...
@@ -991,8 +995,15 @@ class Scheduler(
f
"#cached-token:
{
adder
.
log_hit_tokens
}
, "
f
"token usage:
{
num_used
/
self
.
max_total_num_tokens
:.
2
f
}
, "
f
"#running-req:
{
running_bs
}
, "
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
, "
)
if
self
.
disaggregation_mode
==
DisaggregationMode
.
PREFILL
:
f
+=
f
"#unbootstrapped-req:
{
len
(
self
.
disagg_prefill_bootstrap_queue
.
queue
)
}
, "
f
+=
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
, "
f
+=
f
"#transferring-req:
{
len
(
self
.
disagg_prefill_inflight_queue
)
}
"
else
:
f
+=
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
"
logger
.
info
(
f
)
if
self
.
enable_metrics
:
...
...
@@ -1028,15 +1039,14 @@ class Scheduler(
gap_latency
/
self
.
server_args
.
decode_log_interval
)
if
self
.
spec_algorithm
.
is_none
():
msg
=
(
f
"Decode batch. "
f
"#running-req:
{
num_running_reqs
}
, "
f
"#token:
{
num_used
}
, "
f
"token usage:
{
num_used
/
self
.
max_total_num_tokens
:.
2
f
}
, "
f
"gen throughput (token/s):
{
self
.
last_gen_throughput
:.
2
f
}
, "
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
, "
)
if
self
.
spec_algorithm
.
is_none
():
spec_accept_length
=
0
else
:
spec_accept_length
=
(
...
...
@@ -1045,14 +1055,14 @@ class Scheduler(
self
.
cum_spec_accept_length
+=
self
.
spec_num_total_accepted_tokens
self
.
cum_spec_accept_count
+=
self
.
spec_num_total_forward_ct
self
.
spec_num_total_accepted_tokens
=
self
.
spec_num_total_forward_ct
=
0
msg
=
(
f
"Decode batch. "
f
"#running-req:
{
num_running_reqs
}
, "
f
"#token:
{
num_used
}
, "
f
"token usage:
{
num_used
/
self
.
max_total_num_tokens
:.
2
f
}
, "
f
"accept len:
{
spec_accept_length
:.
2
f
}
, "
msg
+
=
f
"accept len:
{
spec_accept_length
:.
2
f
}
, "
if
self
.
disaggregation_mode
==
DisaggregationMode
.
DECODE
:
msg
+=
f
"pre-allocated usage:
{
self
.
num_tokens_pre_allocated
/
self
.
max_total_num_tokens
:.
2
f
}
, "
msg
+=
(
f
"gen throughput (token/s):
{
self
.
last_gen_throughput
:.
2
f
}
, "
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
,
"
f
"#queue-req:
{
len
(
self
.
waiting_queue
)
}
"
)
logger
.
info
(
msg
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment