Commit 836873b9 (unverified), authored by Liangsheng Yin, committed by GitHub
Browse files

Fix memory leak when aborting decode request in PD-Disagg (#9817)


Co-authored-by: Lianmin Zheng <15100009+merrymercy@users.noreply.github.com>
parent 8abe8dea
...@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run ...@@ -6,7 +6,6 @@ from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
@dataclasses.dataclass @dataclasses.dataclass
class LBArgs: class LBArgs:
rust_lb: bool = False
host: str = "0.0.0.0" host: str = "0.0.0.0"
port: int = 8000 port: int = 8000
policy: str = "random" policy: str = "random"
...@@ -17,11 +16,6 @@ class LBArgs: ...@@ -17,11 +16,6 @@ class LBArgs:
@staticmethod @staticmethod
def add_cli_args(parser: argparse.ArgumentParser): def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--rust-lb",
action="store_true",
help="Deprecated, please use SGLang Router instead, this argument will have no effect.",
)
parser.add_argument( parser.add_argument(
"--host", "--host",
type=str, type=str,
...@@ -92,7 +86,6 @@ class LBArgs: ...@@ -92,7 +86,6 @@ class LBArgs:
] ]
return cls( return cls(
rust_lb=args.rust_lb,
host=args.host, host=args.host,
port=args.port, port=args.port,
policy=args.policy, policy=args.policy,
...@@ -102,12 +95,6 @@ class LBArgs: ...@@ -102,12 +95,6 @@ class LBArgs:
timeout=args.timeout, timeout=args.timeout,
) )
def __post_init__(self):
if not self.rust_lb:
assert (
self.policy == "random"
), "Only random policy is supported for Python load balancer"
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
......
...@@ -2378,6 +2378,10 @@ class Scheduler( ...@@ -2378,6 +2378,10 @@ class Scheduler(
# We still need to send something back to TokenizerManager to clean up the state. # We still need to send something back to TokenizerManager to clean up the state.
req = self.waiting_queue.pop(i) req = self.waiting_queue.pop(i)
self.send_to_tokenizer.send_pyobj(AbortReq(req.rid)) self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
# For disaggregation decode mode, the request in the waiting queue has KV cache allocated.
if self.disaggregation_mode == DisaggregationMode.DECODE:
self.tree_cache.cache_finished_req(req)
logger.debug(f"Abort queued request. {req.rid=}") logger.debug(f"Abort queued request. {req.rid=}")
# Delete the requests in the grammar queue # Delete the requests in the grammar queue
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment