Fix memory leak when aborting decode request in PD-Disagg (#9817)

Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>

Fix memory leak when aborting decode request in PD-Disagg (#9817)
Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
836873b9 · Liangsheng Yin · GitHub · 8abe8dea · 836873b9 · 836873b9
Commit 836873b9 (unverified), authored Aug 30, 2025 by Liangsheng Yin; committed by GitHub on Aug 30, 2025
Showing 2 changed files with 4 additions and 13 deletions

python/sglang/srt/disaggregation/launch_lb.py (+0 -13)

python/sglang/srt/managers/scheduler.py (+4 -0)

No files found.
--- a/python/sglang/srt/disaggregation/launch_lb.py
+++ b/python/sglang/srt/disaggregation/launch_lb.py
@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
 @dataclasses.dataclass
 class LBArgs:
-    rust_lb: bool = False
    host: str = "0.0.0.0"
    port: int = 8000
    policy: str = "random"
@@ -17,11 +16,6 @@ class LBArgs:
    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
-        parser.add_argument(
-            "--rust-lb",
-            action="store_true",
-            help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
-        )
        parser.add_argument(
            "--host",
            type=str,
@@ -92,7 +86,6 @@ class LBArgs:
        ]
        return cls(
-            rust_lb=args.rust_lb,
            host=args.host,
            port=args.port,
            policy=args.policy,
@@ -102,12 +95,6 @@ class LBArgs:
            timeout=args.timeout,
        )
-    def __post_init__(self):
-        if not self.rust_lb:
-            assert (
-                self.policy == "random"
-            ), "Only random policy is supported for Python load balancer"
 def main():
    parser = argparse.ArgumentParser(

--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2378,6 +2378,10 @@ class Scheduler(
            # We still need to send something back to TokenizerManager to clean up the state.
            req = self.waiting_queue.pop(i)
            self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
+            # For disaggregation decode mode, the request in the waiting queue has KV cache allocated.
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.tree_cache.cache_finished_req(req)
            logger.debug(f"Abort queued request. {req.rid=}")
        # Delete the requests in the grammar queue