Unverified Commit 1f5b7872 authored by Shaoshuai Shi's avatar Shaoshuai Shi Committed by GitHub
Browse files

Bug fix: training hangs (stuck) when launched with dist_train.sh (#728)

parent ec982888
...@@ -166,9 +166,9 @@ def init_dist_pytorch(tcp_port, local_rank, backend='nccl'): ...@@ -166,9 +166,9 @@ def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
torch.cuda.set_device(local_rank % num_gpus) torch.cuda.set_device(local_rank % num_gpus)
dist.init_process_group( dist.init_process_group(
backend=backend, backend=backend,
init_method='tcp://127.0.0.1:%d' % tcp_port, # init_method='tcp://127.0.0.1:%d' % tcp_port,
rank=local_rank, # rank=local_rank,
world_size=num_gpus # world_size=num_gpus
) )
rank = dist.get_rank() rank = dist.get_rank()
return num_gpus, rank return num_gpus, rank
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment