Fix a bug in abort & Improve docstrings for abort (#6931)

e6b7053b · Lianmin Zheng · GitHub · 5f91c825 · e6b7053b · e6b7053b
Unverified Commit e6b7053b authored Jun 06, 2025 by Lianmin Zheng Committed by GitHub Jun 06, 2025
Showing with 17 additions and 9 deletions

python/sglang/srt/managers/scheduler.py python/sglang/srt/managers/scheduler.py +16 -8

python/sglang/srt/managers/tokenizer_manager.py python/sglang/srt/managers/tokenizer_manager.py +1 -1

No files found.
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -2041,10 +2041,23 @@ class Scheduler(
        # Sort in reverse order to avoid index issues when deleting
        for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
            req = self.waiting_queue.pop(i)
            self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
            logger.debug(f"Abort queued request. {req.rid=}")
+        # Delete the requests in the grammar queue
+        for req in self.grammar_queue:
+            # Abort method 2: call `set_finish_with_abort`
+            # The request will still run one prefill forward pass.
+            # In this case, we change the input_ids to be only one token to make this prefill cheap.
+            if req.rid.startswith(recv_req.rid):
+                logger.debug(f"Abort grammar queue request. {req.rid=}")
+                req.grammar.cancel()
+                req.set_finish_with_abort("Aborted by AbortReq.")
        # Delete requests in the running batch
        if self.cur_batch is self.running_batch or self.cur_batch is None:
            reqs = self.running_batch.reqs
@@ -2053,17 +2066,12 @@ class Scheduler(
        for req in reqs:
            if req.rid.startswith(recv_req.rid) and not req.finished():
+                # Abort method 3: set `to_abort=True`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
                logger.debug(f"Abort running request. {req.rid=}")
-                # We must use to_abort because it is in a running batch
                req.to_abort = True
-        # Delete the requests in the grammar queue
-        for req in self.grammar_queue:
-            if req.rid.startswith(recv_req.rid):
-                logger.debug(f"Abort grammar queue request. {req.rid=}")
-                req.grammar.cancel()
-                req.set_finish_with_abort("Aborted by AbortReq.")
    def _pause_engine(self) -> Tuple[List[Req], int]:
        raise NotImplementedError()

--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1419,7 +1419,7 @@ class TokenizerManager:
            asyncio.create_task(asyncio.to_thread(background_task))
    def _handle_abort_req(self, recv_obj):
-        self.rid_to_state.pop(recv_obj.rid)
+        self.rid_to_state.pop(recv_obj.rid, None)
    def _handle_open_session_req_output(self, recv_obj):
        self.session_futures[recv_obj.session_id].set_result(