Unverified commit b82662d9, authored by Nick Hill, committed by GitHub
Browse files

[BugFix] Fix torch distributed stateless PG backend init (#14870)


Signed-off-by: Nick Hill <nhill@redhat.com>
parent 71c1e071
...@@ -76,5 +76,10 @@ if __name__ == "__main__": ...@@ -76,5 +76,10 @@ if __name__ == "__main__":
GPUs_per_dp_rank)) GPUs_per_dp_rank))
proc.start() proc.start()
procs.append(proc) procs.append(proc)
exit_code = 0
for proc in procs: for proc in procs:
proc.join() proc.join()
if proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)
...@@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group( ...@@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
# different systems (e.g. RPC) in case the store is multi-tenant. # different systems (e.g. RPC) in case the store is multi-tenant.
prefix_store = PrefixStore(init_method, store) prefix_store = PrefixStore(init_method, store)
pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
pg: ProcessGroup = ProcessGroup( pg: ProcessGroup = ProcessGroup(
prefix_store, prefix_store,
group_rank, group_rank,
group_size, group_size,
pg_options,
) )
if backend == "gloo": if backend == "gloo":
...@@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group( ...@@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
backend_options) backend_options)
backend_type = ProcessGroup.BackendType.NCCL backend_type = ProcessGroup.BackendType.NCCL
device = torch.device("cuda") device = torch.device("cuda")
else:
raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group() backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class) pg._register_backend(device, backend_type, backend_class)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment