Unverified Commit e1b63624 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Only stream output on tp rank 0 (#2124)

parent c35cd1f8
......@@ -134,8 +134,8 @@ class Scheduler:
)
else:
self.recv_from_tokenizer = None
self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda _: None)
self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda _: None)
# Init tokenizer
self.model_config = ModelConfig(
......@@ -1028,7 +1028,8 @@ class Scheduler:
else:
self.tree_cache.cache_unfinished_req(req)
self.stream_output(batch.reqs)
if self.tp_rank == 0:
self.stream_output(batch.reqs)
def process_batch_result_decode(self, batch: ScheduleBatch, result):
logits_output, next_token_ids, bid = result
......@@ -1079,7 +1080,8 @@ class Scheduler:
torch.cuda.current_stream().synchronize()
batch.next_batch_sampling_info.sampling_info_done.set()
self.stream_output(batch.reqs)
if self.tp_rank == 0:
self.stream_output(batch.reqs)
self.token_to_kv_pool.free_group_end()
......
......@@ -179,7 +179,7 @@ class ModelRunner:
if self.device == "cuda":
torch.cuda.set_device(self.gpu_id)
backend = "nccl"
# ToDO(liangan1):Just use gloo to bypass the initilization fail
# TODO(liangan1): Just use gloo to bypass the initialization fail
# Need to use xccl for xpu backend in the future
elif self.device == "xpu":
torch.xpu.set_device(self.gpu_id)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment