Commit 1f5b7872 (unverified), authored by Shaoshuai Shi, committed by GitHub
Browse files

Bug fix: training hangs when launched with dist_train.sh (#728)

parent ec982888
......@@ -166,9 +166,9 @@ def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
torch.cuda.set_device(local_rank % num_gpus)
dist.init_process_group(
backend=backend,
-init_method='tcp://127.0.0.1:%d' % tcp_port,
-rank=local_rank,
-world_size=num_gpus
+# init_method='tcp://127.0.0.1:%d' % tcp_port,
+# rank=local_rank,
+# world_size=num_gpus
)
rank = dist.get_rank()
return num_gpus, rank
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment