Simplify watchdog (#12463)

2c9aebea · Liangsheng Yin · GitHub · bc741073 · 2c9aebea · 2c9aebea
Unverified Commit 2c9aebea authored Oct 31, 2025 by Liangsheng Yin Committed by GitHub Oct 31, 2025
Showing with 45 additions and 72 deletions

python/sglang/srt/managers/scheduler.py +0 -72

python/sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0

No files found.
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -170,7 +170,6 @@ from sglang.srt.utils import (
    broadcast_pyobj,
    configure_gc_logger,
    configure_logger,
-    disable_request_logging,
    freeze_gc,
    get_available_gpu_memory,
    get_bool_env_var,
@@ -179,7 +178,6 @@ from sglang.srt.utils import (
    kill_itself_when_parent_died,
    numa_bind_to_node,
    point_to_point_pyobj,
-    pyspy_dump_schedulers,
    require_mlp_sync,
    require_mlp_tp_gather,
    set_gpu_proc_affinity,
@@ -2295,76 +2293,6 @@ class Scheduler(
            self._add_request_to_queue(req)
        self.grammar_queue = self.grammar_queue[num_ready_reqs:]

-    def watchdog_thread(self):
-        """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
-        self.watchdog_last_forward_ct = 0
-        self.watchdog_last_time = time.perf_counter()
-
-        while True:
-            current = time.perf_counter()
-            if self.cur_batch is not None:
-                if self.watchdog_last_forward_ct == self.forward_ct:
-                    if current > self.watchdog_last_time + self.watchdog_timeout:
-                        break
-                else:
-                    self.watchdog_last_forward_ct = self.forward_ct
-                    self.watchdog_last_time = current
-            time.sleep(self.watchdog_timeout // 2)
-
-        if not disable_request_logging():
-            # Print batch size and memory pool info to check whether there are de-sync issues.
-            if self.is_hybrid:
-                (
-                    _,
-                    _,
-                    _,
-                    _,
-                    full_available_size,
-                    full_evictable_size,
-                    swa_available_size,
-                    swa_evictable_size,
-                ) = self._get_swa_token_info()
-                info_msg = (
-                    f"{full_available_size=}, "
-                    f"{full_evictable_size=}, "
-                    f"{swa_available_size=}, "
-                    f"{swa_evictable_size=}, "
-                )
-            elif self.is_hybrid_gdn and isinstance(self.tree_cache, MambaRadixCache):
-                (
-                    _,
-                    _,
-                    _,
-                    _,
-                    full_available_size,
-                    full_evictable_size,
-                    mamba_available_size,
-                    mamba_evictable_size,
-                ) = self._get_mamba_token_info()
-                info_msg = (
-                    f"{full_available_size=}, "
-                    f"{full_evictable_size=}, "
-                    f"{mamba_available_size=}, "
-                    f"{mamba_evictable_size=}, "
-                )
-            else:
-                _, _, available_size, evictable_size = self._get_token_info()
-                info_msg = f"{available_size=}, " f"{evictable_size=}, "
-            logger.error(
-                f"{self.cur_batch.batch_size()=}, "
-                f"{self.cur_batch.reqs=}, "
-                f"{info_msg}"
-            )
-
-        pyspy_dump_schedulers()
-        logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
-        print(file=sys.stderr, flush=True)
-        print(file=sys.stdout, flush=True)
-
-        # Wait for some time so that the parent process can print the error.
-        time.sleep(5)
-        self.parent_process.send_signal(signal.SIGQUIT)
-
    def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
        success = self.flush_cache()
        return FlushCacheReqOutput(success=success)

--- a/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
+++ b/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
 from __future__ import annotations

+import logging
+import signal
+import sys
 import time
 from typing import TYPE_CHECKING

@@ -7,10 +10,13 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
 from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.utils.common import disable_request_logging, pyspy_dump_schedulers

 if TYPE_CHECKING:
    from sglang.srt.managers.scheduler import Scheduler

+logger = logging.getLogger(__name__)
+

 class SchedulerRuntimeCheckerMixin:

@@ -215,3 +221,42 @@ class SchedulerRuntimeCheckerMixin:
        self.check_tree_cache()
        self.new_token_ratio = self.init_new_token_ratio
        self.maybe_sleep_on_idle()
+
+    def watchdog_thread(self: Scheduler):
+        """Watch for a stalled forward pass and signal the parent process to quit.
+
+        Polls ``self.forward_ct`` while ``self.cur_batch`` is set. Once the
+        counter has stayed unchanged for more than ``self.watchdog_timeout``
+        seconds, the polling loop exits, diagnostics are logged, and
+        ``SIGQUIT`` is sent to ``self.parent_process``. The call blocks until
+        that happens, so it is intended to run on its own thread.
+        """
+        self.watchdog_last_forward_ct = 0
+        self.watchdog_last_time = time.perf_counter()
+
+        while True:
+            current = time.perf_counter()
+            if self.cur_batch is not None:
+                if self.watchdog_last_forward_ct == self.forward_ct:
+                    # No progress since the last recorded change of forward_ct.
+                    if current > self.watchdog_last_time + self.watchdog_timeout:
+                        break
+                else:
+                    # Progress was made: record the new counter and restart the clock.
+                    self.watchdog_last_forward_ct = self.forward_ct
+                    self.watchdog_last_time = current
+            # True division on purpose: floor division rounds any timeout below
+            # 2 seconds down to a zero-second sleep, turning this loop into a
+            # busy spin.
+            time.sleep(self.watchdog_timeout / 2)
+
+        if not disable_request_logging():
+            # Print batch size and memory pool info to check whether there are de-sync issues.
+            if self.is_hybrid:
+                _, info_msg = self._check_hybrid_memory()
+            elif self.is_hybrid_gdn and isinstance(self.tree_cache, MambaRadixCache):
+                _, info_msg = self._check_mamba_memory()
+            else:
+                _, info_msg = self._check_radix_cache_memory()
+            # Snapshot once: cur_batch was only checked for None inside the loop,
+            # so read it a single time here to avoid an AttributeError if it has
+            # been cleared in the meantime.
+            batch = self.cur_batch
+            if batch is not None:
+                batch_msg = (
+                    f"self.cur_batch.batch_size()={batch.batch_size()!r}\n"
+                    f"self.cur_batch.reqs={batch.reqs!r}\n"
+                )
+            else:
+                batch_msg = "self.cur_batch=None\n"
+            logger.error(f"{batch_msg}{info_msg}")
+
+        pyspy_dump_schedulers()
+        logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
+        # Emit a newline on both streams and flush them before the process is signalled.
+        print(file=sys.stderr, flush=True)
+        print(file=sys.stdout, flush=True)
+
+        # Wait for some time so that the parent process can print the error.
+        time.sleep(5)
+        self.parent_process.send_signal(signal.SIGQUIT)