Unverified Commit e1bfcecb authored by Shaoshuai Shi's avatar Shaoshuai Shi Committed by GitHub
Browse files

bugfixed: solve the issue of blocking with dist_train.sh, automatically search tcp ports (#815)

* bugfixed: stuck when training with dist_train.sh, support tcp_port

* bugfixed: solve the issue of blocking with dist_train.sh, automatically search tcp ports
parent a5cf2a53
...@@ -161,8 +161,8 @@ def init_dist_slurm(tcp_port, local_rank, backend='nccl'): ...@@ -161,8 +161,8 @@ def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
def init_dist_pytorch(tcp_port, local_rank, backend='nccl'): def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
if mp.get_start_method(allow_none=True) is None: if mp.get_start_method(allow_none=True) is None:
mp.set_start_method('spawn') mp.set_start_method('spawn')
os.environ['MASTER_PORT'] = str(tcp_port) # os.environ['MASTER_PORT'] = str(tcp_port)
os.environ['MASTER_ADDR'] = 'localhost' # os.environ['MASTER_ADDR'] = 'localhost'
num_gpus = torch.cuda.device_count() num_gpus = torch.cuda.device_count()
torch.cuda.set_device(local_rank % num_gpus) torch.cuda.set_device(local_rank % num_gpus)
......
...@@ -4,5 +4,15 @@ set -x ...@@ -4,5 +4,15 @@ set -x
NGPUS=$1 NGPUS=$1
PY_ARGS=${@:2} PY_ARGS=${@:2}
python -m torch.distributed.launch --nproc_per_node=${NGPUS} train.py --launcher pytorch ${PY_ARGS} while true
do
PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
if [ "${status}" != "0" ]; then
break;
fi
done
echo $PORT
python -m torch.distributed.launch --nproc_per_node=${NGPUS} --rdzv_endpoint=localhost:${PORT} train.py --launcher pytorch ${PY_ARGS}
#!/usr/bin/env bash
# Launch train.py through torchrun on this machine, using a randomly chosen
# local TCP port that nothing is currently listening on as the rendezvous port.
#
# Usage: dist_train.sh NGPUS [train.py arguments...]
#   NGPUS  value passed to torchrun's --nproc_per_node
#   rest   forwarded unchanged to train.py after "--launcher pytorch"

set -x

if [ "$#" -lt 1 ]; then
    echo "usage: $0 NGPUS [train.py arguments...]" >&2
    exit 1
fi

NGPUS=$1
# Keep the remaining arguments in an array so that values containing spaces
# or glob characters reach train.py exactly as the caller wrote them.
PY_ARGS=("${@:2}")

# Without nc the probe below always fails, so the first random port is taken
# unchecked. Keep that best-effort behaviour, but say so instead of hiding it.
if ! command -v nc >/dev/null 2>&1; then
    echo "warning: 'nc' not found; the chosen port will not be checked for conflicts" >&2
fi

while true
do
    # Combine two RANDOM draws into one wider value, then map it into
    # the range [10000, 59151].
    PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
    # A zero exit status from the probe means something already accepts
    # connections on this port; any non-zero status ends the search.
    if ! nc -z 127.0.0.1 "${PORT}" < /dev/null &>/dev/null; then
        break
    fi
done
echo "${PORT}"

torchrun --nproc_per_node="${NGPUS}" --rdzv_endpoint="localhost:${PORT}" train.py --launcher pytorch "${PY_ARGS[@]}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment