Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
bd7cfbd2
"tests/vscode:/vscode.git/clone" did not exist on "06608f847c1e58b16a2c3eb876b1afaa7cc7a7f7"
Unverified
Commit
bd7cfbd2
authored
Jun 13, 2025
by
Povilas Kanapickas
Committed by
GitHub
Jun 12, 2025
Browse files
[Fix] Reduce busy polling when scheduler is idle (#6026)
parent
4b9971e4
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
48 additions
and
1 deletion
+48
-1
docs/backend/server_arguments.md
docs/backend/server_arguments.md
+1
-1
python/sglang/srt/disaggregation/decode.py
python/sglang/srt/disaggregation/decode.py
+2
-0
python/sglang/srt/disaggregation/prefill.py
python/sglang/srt/disaggregation/prefill.py
+2
-0
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+37
-0
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-0
No files found.
docs/backend/server_arguments.md
View file @
bd7cfbd2
...
@@ -107,7 +107,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
...
@@ -107,7 +107,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
|
`--download-dir`
| Model download directory for huggingface. | None |
|
`--download-dir`
| Model download directory for huggingface. | None |
|
`--base-gpu-id`
| The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
|
`--base-gpu-id`
| The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
|
`--gpu-id-step`
| The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
|
`--gpu-id-step`
| The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
|
`--sleep-on-idle`
| Reduce CPU usage when sglang is idle. | False |
## Logging
## Logging
...
...
python/sglang/srt/disaggregation/decode.py
View file @
bd7cfbd2
...
@@ -550,6 +550,7 @@ class SchedulerDisaggregationDecodeMixin:
...
@@ -550,6 +550,7 @@ class SchedulerDisaggregationDecodeMixin:
# When the server is idle, do self-check and re-init some states
# When the server is idle, do self-check and re-init some states
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
...
@@ -628,6 +629,7 @@ class SchedulerDisaggregationDecodeMixin:
...
@@ -628,6 +629,7 @@ class SchedulerDisaggregationDecodeMixin:
# When the server is idle, do self-check and re-init some states
# When the server is idle, do self-check and re-init some states
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
self
.
last_batch_in_queue
=
last_batch_in_queue
self
.
last_batch_in_queue
=
last_batch_in_queue
...
...
python/sglang/srt/disaggregation/prefill.py
View file @
bd7cfbd2
...
@@ -242,6 +242,7 @@ class SchedulerDisaggregationPrefillMixin:
...
@@ -242,6 +242,7 @@ class SchedulerDisaggregationPrefillMixin:
if
batch
is
None
and
len
(
self
.
disagg_prefill_inflight_queue
)
==
0
:
if
batch
is
None
and
len
(
self
.
disagg_prefill_inflight_queue
)
==
0
:
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
...
@@ -294,6 +295,7 @@ class SchedulerDisaggregationPrefillMixin:
...
@@ -294,6 +295,7 @@ class SchedulerDisaggregationPrefillMixin:
if
batch
is
None
and
len
(
self
.
disagg_prefill_inflight_queue
)
==
0
:
if
batch
is
None
and
len
(
self
.
disagg_prefill_inflight_queue
)
==
0
:
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
# HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
...
...
python/sglang/srt/managers/scheduler.py
View file @
bd7cfbd2
...
@@ -179,6 +179,27 @@ class EmbeddingBatchResult:
...
@@ -179,6 +179,27 @@ class EmbeddingBatchResult:
bid
:
int
bid
:
int
class IdleSleeper:
    """Block on a set of zmq sockets instead of busy-polling while idle.

    In setups which have long inactivity periods it is desirable to reduce
    system power consumption when sglang does nothing. This leads not only
    to power savings, but also to more CPU thermal headroom when a request
    eventually comes. This is important in cases when multiple GPUs are
    connected, as each GPU would otherwise pin one thread at 100% CPU usage.

    The simplest solution is to use zmq.Poller on all sockets that may receive
    data that needs handling immediately.

    Args:
        sockets: Iterable of zmq sockets; each one is registered for
            ``zmq.POLLIN`` events.
        timeout_ms: Value passed to ``Poller.poll`` on every
            ``maybe_sleep`` call, i.e. the longest a single call blocks,
            in milliseconds. Defaults to 1000, the previously hard-coded
            value.
    """

    def __init__(self, sockets, timeout_ms=1000):
        self.timeout_ms = timeout_ms
        self.poller = zmq.Poller()
        for sock in sockets:
            self.poller.register(sock, zmq.POLLIN)

    def maybe_sleep(self):
        """Wait until a registered socket is readable or the timeout expires."""
        # The list of ready sockets returned by poll() is not used here;
        # the call only serves to block the calling thread.
        self.poller.poll(self.timeout_ms)
class
Scheduler
(
class
Scheduler
(
SchedulerOutputProcessorMixin
,
SchedulerOutputProcessorMixin
,
SchedulerDisaggregationDecodeMixin
,
SchedulerDisaggregationDecodeMixin
,
...
@@ -228,6 +249,8 @@ class Scheduler(
...
@@ -228,6 +249,8 @@ class Scheduler(
# Init inter-process communication
# Init inter-process communication
context
=
zmq
.
Context
(
2
)
context
=
zmq
.
Context
(
2
)
self
.
idle_sleeper
=
None
if
self
.
pp_rank
==
0
and
self
.
attn_tp_rank
==
0
:
if
self
.
pp_rank
==
0
and
self
.
attn_tp_rank
==
0
:
self
.
recv_from_tokenizer
=
get_zmq_socket
(
self
.
recv_from_tokenizer
=
get_zmq_socket
(
context
,
zmq
.
PULL
,
port_args
.
scheduler_input_ipc_name
,
False
context
,
zmq
.
PULL
,
port_args
.
scheduler_input_ipc_name
,
False
...
@@ -250,6 +273,13 @@ class Scheduler(
...
@@ -250,6 +273,13 @@ class Scheduler(
self
.
recv_from_rpc
=
get_zmq_socket
(
self
.
recv_from_rpc
=
get_zmq_socket
(
context
,
zmq
.
DEALER
,
port_args
.
rpc_ipc_name
,
False
context
,
zmq
.
DEALER
,
port_args
.
rpc_ipc_name
,
False
)
)
if
self
.
server_args
.
sleep_on_idle
:
self
.
idle_sleeper
=
IdleSleeper
(
[
self
.
recv_from_tokenizer
,
self
.
recv_from_rpc
,
]
)
else
:
else
:
self
.
recv_from_tokenizer
=
None
self
.
recv_from_tokenizer
=
None
self
.
recv_from_rpc
=
None
self
.
recv_from_rpc
=
None
...
@@ -478,6 +508,10 @@ class Scheduler(
...
@@ -478,6 +508,10 @@ class Scheduler(
)
)
self
.
init_disaggregation
()
self
.
init_disaggregation
()
def maybe_sleep_on_idle(self):
    """Delegate to the idle sleeper's ``maybe_sleep`` when one is set.

    Does nothing when ``self.idle_sleeper`` is ``None``.
    """
    sleeper = self.idle_sleeper
    if sleeper is None:
        # No sleeper is set on this instance, so there is nothing to wait on.
        return
    sleeper.maybe_sleep()
def
init_tokenizer
(
self
):
def
init_tokenizer
(
self
):
server_args
=
self
.
server_args
server_args
=
self
.
server_args
...
@@ -667,6 +701,7 @@ class Scheduler(
...
@@ -667,6 +701,7 @@ class Scheduler(
# When the server is idle, do self-check and re-init some states
# When the server is idle, do self-check and re-init some states
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
...
@@ -711,6 +746,7 @@ class Scheduler(
...
@@ -711,6 +746,7 @@ class Scheduler(
# When the server is idle, do self-check and re-init some states
# When the server is idle, do self-check and re-init some states
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
self
.
last_batch
=
batch
self
.
last_batch
=
batch
...
@@ -816,6 +852,7 @@ class Scheduler(
...
@@ -816,6 +852,7 @@ class Scheduler(
if
server_is_idle
:
if
server_is_idle
:
self
.
check_memory
()
self
.
check_memory
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
def
recv_requests
(
self
)
->
List
[
Req
]:
def
recv_requests
(
self
)
->
List
[
Req
]:
"""Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
"""Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
...
...
python/sglang/srt/server_args.py
View file @
bd7cfbd2
...
@@ -90,6 +90,7 @@ class ServerArgs:
...
@@ -90,6 +90,7 @@ class ServerArgs:
download_dir
:
Optional
[
str
]
=
None
download_dir
:
Optional
[
str
]
=
None
base_gpu_id
:
int
=
0
base_gpu_id
:
int
=
0
gpu_id_step
:
int
=
1
gpu_id_step
:
int
=
1
sleep_on_idle
:
bool
=
False
# Logging
# Logging
log_level
:
str
=
"info"
log_level
:
str
=
"info"
...
@@ -844,6 +845,11 @@ class ServerArgs:
...
@@ -844,6 +845,11 @@ class ServerArgs:
default
=
ServerArgs
.
gpu_id_step
,
default
=
ServerArgs
.
gpu_id_step
,
help
=
"The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,..."
,
help
=
"The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,..."
,
)
)
parser
.
add_argument
(
"--sleep-on-idle"
,
action
=
"store_true"
,
help
=
"Reduce CPU usage when sglang is idle."
,
)
# Logging
# Logging
parser
.
add_argument
(
parser
.
add_argument
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment