Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
2c9aebea
Unverified
Commit
2c9aebea
authored
Oct 31, 2025
by
Liangsheng Yin
Committed by
GitHub
Oct 31, 2025
Browse files
Simplify watchdog (#12463)
parent
bc741073
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
72 deletions
+45
-72
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+0
-72
python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
...on/sglang/srt/managers/scheduler_runtime_checker_mixin.py
+45
-0
No files found.
python/sglang/srt/managers/scheduler.py
View file @
2c9aebea
...
...
@@ -170,7 +170,6 @@ from sglang.srt.utils import (
broadcast_pyobj
,
configure_gc_logger
,
configure_logger
,
disable_request_logging
,
freeze_gc
,
get_available_gpu_memory
,
get_bool_env_var
,
...
...
@@ -179,7 +178,6 @@ from sglang.srt.utils import (
kill_itself_when_parent_died
,
numa_bind_to_node
,
point_to_point_pyobj
,
pyspy_dump_schedulers
,
require_mlp_sync
,
require_mlp_tp_gather
,
set_gpu_proc_affinity
,
...
...
@@ -2295,76 +2293,6 @@ class Scheduler(
self
.
_add_request_to_queue
(
req
)
self
.
grammar_queue
=
self
.
grammar_queue
[
num_ready_reqs
:]
def watchdog_thread(self):
    """Kill the server if one forward batch makes no progress for too long.

    Runs as a polling loop: every ``self.watchdog_timeout // 2`` seconds it
    compares ``self.forward_ct`` with the last value it observed. If a batch
    is active (``self.cur_batch`` is not None) and the counter has not moved
    for more than ``self.watchdog_timeout`` seconds, the loop exits, optional
    diagnostics are logged, and ``SIGQUIT`` is sent to ``self.parent_process``.
    """
    self.watchdog_last_forward_ct = 0
    self.watchdog_last_time = time.perf_counter()

    while True:
        current = time.perf_counter()
        if self.cur_batch is None:
            # Idle: nothing can be stuck. Refresh the reference time so a
            # long idle period is not counted against the next batch.
            self.watchdog_last_time = current
        elif self.watchdog_last_forward_ct != self.forward_ct:
            # Progress was made since the last poll.
            self.watchdog_last_forward_ct = self.forward_ct
            self.watchdog_last_time = current
        elif current > self.watchdog_last_time + self.watchdog_timeout:
            break
        time.sleep(self.watchdog_timeout // 2)

    if not disable_request_logging():
        # Diagnostics are best-effort: a failure here must never prevent the
        # watchdog from signalling the parent process below.
        try:
            # Print batch size and memory pool info to check whether there are de-sync issues.
            if self.is_hybrid:
                (
                    _,
                    _,
                    _,
                    _,
                    full_available_size,
                    full_evictable_size,
                    swa_available_size,
                    swa_evictable_size,
                ) = self._get_swa_token_info()
                info_msg = (
                    f"{full_available_size=}, "
                    f"{full_evictable_size=}, "
                    f"{swa_available_size=}, "
                    f"{swa_evictable_size=}, "
                )
            elif self.is_hybrid_gdn and isinstance(
                self.tree_cache, MambaRadixCache
            ):
                (
                    _,
                    _,
                    _,
                    _,
                    full_available_size,
                    full_evictable_size,
                    mamba_available_size,
                    mamba_evictable_size,
                ) = self._get_mamba_token_info()
                info_msg = (
                    f"{full_available_size=}, "
                    f"{full_evictable_size=}, "
                    f"{mamba_available_size=}, "
                    f"{mamba_evictable_size=}, "
                )
            else:
                _, _, available_size, evictable_size = self._get_token_info()
                info_msg = f"{available_size=}, " f"{evictable_size=}, "
            logger.error(
                f"{self.cur_batch.batch_size()=}, "
                f"{self.cur_batch.reqs=}, "
                f"{info_msg}"
            )
        except Exception:
            logger.exception("Failed to collect watchdog diagnostics")

    pyspy_dump_schedulers()
    logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
    # Emit a newline on both streams and flush them before the process dies.
    print(file=sys.stderr, flush=True)
    print(file=sys.stdout, flush=True)

    # Wait for some time so that the parent process can print the error.
    time.sleep(5)
    self.parent_process.send_signal(signal.SIGQUIT)
def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
    """Run ``self.flush_cache()`` and report its result.

    ``recv_req`` is accepted but not read in this method. The value returned
    by ``flush_cache`` is passed through as the ``success`` field of a
    ``FlushCacheReqOutput``.
    """
    return FlushCacheReqOutput(success=self.flush_cache())
...
...
python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
View file @
2c9aebea
from
__future__
import
annotations
import
logging
import
signal
import
sys
import
time
from
typing
import
TYPE_CHECKING
...
...
@@ -7,10 +10,13 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
from
sglang.srt.managers.schedule_batch
import
ScheduleBatch
from
sglang.srt.mem_cache.mamba_radix_cache
import
MambaRadixCache
from
sglang.srt.mem_cache.swa_radix_cache
import
SWARadixCache
from
sglang.srt.utils.common
import
disable_request_logging
,
pyspy_dump_schedulers
if
TYPE_CHECKING
:
from
sglang.srt.managers.scheduler
import
Scheduler
logger
=
logging
.
getLogger
(
__name__
)
class
SchedulerRuntimeCheckerMixin
:
...
...
@@ -215,3 +221,42 @@ class SchedulerRuntimeCheckerMixin:
self
.
check_tree_cache
()
self
.
new_token_ratio
=
self
.
init_new_token_ratio
self
.
maybe_sleep_on_idle
()
def watchdog_thread(self: Scheduler):
    """Kill the server if one forward batch makes no progress for too long.

    Runs as a polling loop: every ``self.watchdog_timeout // 2`` seconds it
    compares ``self.forward_ct`` with the last value it observed. If a batch
    is active (``self.cur_batch`` is not None) and the counter has not moved
    for more than ``self.watchdog_timeout`` seconds, the loop exits, optional
    diagnostics are logged, and ``SIGQUIT`` is sent to ``self.parent_process``.
    """
    self.watchdog_last_forward_ct = 0
    self.watchdog_last_time = time.perf_counter()

    while True:
        current = time.perf_counter()
        if self.cur_batch is None:
            # Idle: nothing can be stuck. Refresh the reference time so a
            # long idle period is not counted against the next batch.
            self.watchdog_last_time = current
        elif self.watchdog_last_forward_ct != self.forward_ct:
            # Progress was made since the last poll.
            self.watchdog_last_forward_ct = self.forward_ct
            self.watchdog_last_time = current
        elif current > self.watchdog_last_time + self.watchdog_timeout:
            break
        time.sleep(self.watchdog_timeout // 2)

    if not disable_request_logging():
        # Diagnostics are best-effort: a failure here must never prevent the
        # watchdog from signalling the parent process below.
        try:
            # Print batch size and memory pool info to check whether there are de-sync issues.
            if self.is_hybrid:
                _, info_msg = self._check_hybrid_memory()
            elif self.is_hybrid_gdn and isinstance(
                self.tree_cache, MambaRadixCache
            ):
                _, info_msg = self._check_mamba_memory()
            else:
                _, info_msg = self._check_radix_cache_memory()
            logger.error(
                f"{self.cur_batch.batch_size()=}\n"
                f"{self.cur_batch.reqs=}\n"
                f"{info_msg}"
            )
        except Exception:
            logger.exception("Failed to collect watchdog diagnostics")

    pyspy_dump_schedulers()
    logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
    # Emit a newline on both streams and flush them before the process dies.
    print(file=sys.stderr, flush=True)
    print(file=sys.stdout, flush=True)

    # Wait for some time so that the parent process can print the error.
    time.sleep(5)
    self.parent_process.send_signal(signal.SIGQUIT)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment