Improve performance when running with full parallel (#394)

9216b106 · Liangsheng Yin · GitHub · da19434c · 9216b106
Unverified commit 9216b106, authored Apr 25, 2024 by Liangsheng Yin; committed by GitHub on Apr 25, 2024
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 1 deletion

python/sglang/srt/managers/router/model_rpc.py +3 -1

No files found.
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -348,6 +348,7 @@ class ModelRpcServer:
                    # Undo the insertion
                    delta = self.tree_cache.dec_ref_counter(req.last_node)
                    available_size += delta
+                    break
                else:
                    # Add this request to the running batch
                    self.token_to_kv_pool.add_refs(req.prefix_indices)
@@ -356,7 +357,8 @@ class ModelRpcServer:
                        req.extend_input_len + req.max_new_tokens()
                    )
                    new_batch_input_tokens += req.extend_input_len
+            else:
+                break
        if len(can_run_list) == 0:
            return None