Unverified commit 0860087a, authored by Conroy Cheers, committed by GitHub
Browse files

[Fix] Fall back to Gloo when NCCL backend is unavailable (#19641)


Signed-off-by: conroy-cheers <conroy@corncheese.org>
parent 6bc7b573
@@ -938,6 +938,13 @@ def init_distributed_environment(
assert distributed_init_method is not None, ( assert distributed_init_method is not None, (
"distributed_init_method must be provided when initializing " "distributed_init_method must be provided when initializing "
"distributed environment") "distributed environment")
if not torch.distributed.is_backend_available(backend):
logger.warning(
"Distributed backend %s is not available; "
"falling back to gloo.", backend)
assert torch.distributed.is_gloo_available(), (
"Fallback Gloo backend is not available.")
backend = "gloo"
# this backend is used for WORLD # this backend is used for WORLD
torch.distributed.init_process_group( torch.distributed.init_process_group(
backend=backend, backend=backend,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment