"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "eef921f45e7d3efb2ed2ccab80ee20ee2e4ebe38"
Unverified commit 4ab3ac28, authored by Michael Goin, committed by GitHub
Browse files

[Bugfix] Fix flaky failure when getting DP ports (#20151)


Signed-off-by: mgoin <mgoin64@gmail.com>
parent d1c956dc
@@ -1878,18 +1878,41 @@ class ParallelConfig:
return answer return answer
def stateless_init_dp_group(self) -> "ProcessGroup":
    """Create the data-parallel process group, retrying on port clashes.

    Each attempt calls ``self.get_next_dp_init_port()`` again, so a retry
    uses a freshly chosen port rather than the one that just failed.

    Returns:
        The value returned by
        ``stateless_init_torch_distributed_process_group`` for this
        rank (annotated as a ``ProcessGroup``).

    Raises:
        DistNetworkError: immediately if the error text does not contain
            ``EADDRINUSE``; otherwise after ``max_retries`` attempts
            have all failed with ``EADDRINUSE``.
    """
    # NOTE: In high-concurrency scenarios multiple processes
    # can pick the same (currently free) port through a race
    # condition when calling `get_open_port()`. When the first
    # process binds the port the others will subsequently fail
    # with `torch.distributed.DistNetworkError: EADDRINUSE`.
    # To make the initialization more robust we retry a few times
    # with a fresh port whenever this specific error is observed.
    from torch.distributed import DistNetworkError

    from vllm.distributed.utils import (
        stateless_init_torch_distributed_process_group)

    max_retries = 5
    last_exc: Optional[Exception] = None
    for _ in range(max_retries):
        try:
            # use gloo since the engine process might not have cuda device
            return stateless_init_torch_distributed_process_group(
                self.data_parallel_master_ip,
                self.get_next_dp_init_port(),
                self.data_parallel_rank,
                self.data_parallel_size,
                backend="gloo")
        except DistNetworkError as e:
            # We only want to retry when the root cause is EADDRINUSE;
            # any other network error propagates unchanged.
            if "EADDRINUSE" not in str(e):
                raise
            logger.warning(
                "Address already in use. Retrying with a new port.")
            last_exc = e  # remembered so it can be re-raised below

    # If we get here all retries have failed.
    assert last_exc is not None
    raise last_exc
@staticmethod @staticmethod
def has_unfinished_dp(dp_group: "ProcessGroup", def has_unfinished_dp(dp_group: "ProcessGroup",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment