Unverified commit e1bfcecb, authored by Shaoshuai Shi, committed by GitHub

Bugfix: solve the issue of dist_train.sh blocking by automatically searching for a free TCP port (#815)

* bugfix: training got stuck with dist_train.sh; support a configurable tcp_port

* bugfix: solve the blocking issue with dist_train.sh by automatically searching for a free TCP port
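
Background on the fix: with a hard-coded `MASTER_PORT`, a second distributed job started on the same machine tries to rendezvous on a TCP port that is already taken, and `torch.distributed.init_process_group` blocks indefinitely waiting for peers that can never arrive. The shell scripts below probe candidate ports with `nc -z` until one is free. As an illustration only (not part of the commit), the same probe can be written in Python:

```python
# Illustration only (not from the commit): a Python equivalent of `nc -z`,
# reporting whether something is already listening on a local TCP port.
import socket

def port_in_use(port, host='127.0.0.1'):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(0.5)
        # connect_ex returns 0 when the connection succeeds, i.e. another
        # process (such as a running distributed job) holds the port.
        return s.connect_ex((host, port)) == 0
```

The Python-side diff drops the hard-coded rendezvous settings so they no longer conflict with the port chosen by the script: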
parent a5cf2a53
```diff
@@ -161,8 +161,8 @@ def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
 def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
     if mp.get_start_method(allow_none=True) is None:
         mp.set_start_method('spawn')
-    os.environ['MASTER_PORT'] = str(tcp_port)
-    os.environ['MASTER_ADDR'] = 'localhost'
+    # os.environ['MASTER_PORT'] = str(tcp_port)
+    # os.environ['MASTER_ADDR'] = 'localhost'
     num_gpus = torch.cuda.device_count()
     torch.cuda.set_device(local_rank % num_gpus)
```
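
Why these two lines can go: when the job is started through `torch.distributed.launch` or `torchrun` with `--rdzv_endpoint`, the launcher itself exports `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE` into each worker's environment, so re-setting them here would override the endpoint the script selected. A minimal sketch of worker-side initialization under that assumption (hypothetical helper, not the project's code):

```python
# Minimal sketch (assumption, not the project's code): workers spawned by
# torchrun or torch.distributed.launch already carry the rendezvous settings
# (MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE) in their environment.
import os
import torch
import torch.distributed as dist

def init_from_launcher(backend='nccl'):
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank % torch.cuda.device_count())
    dist.init_process_group(backend=backend)  # default init_method is 'env://'
    return dist.get_rank(), dist.get_world_size()
```

On the shell side, `dist_train.sh` replaces the fixed single-port launch with a search loop and passes the result via `--rdzv_endpoint`: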
```diff
@@ -4,5 +4,15 @@ set -x
 NGPUS=$1
 PY_ARGS=${@:2}
 
-python -m torch.distributed.launch --nproc_per_node=${NGPUS} train.py --launcher pytorch ${PY_ARGS}
+while true
+do
+    PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
+    status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
+    if [ "${status}" != "0" ]; then
+        break;
+    fi
+done
+echo $PORT
+
+python -m torch.distributed.launch --nproc_per_node=${NGPUS} --rdzv_endpoint=localhost:${PORT} train.py --launcher pytorch ${PY_ARGS}
```
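
How the search works: bash's `$RANDOM` is a 15-bit value (0-32767), so `((RANDOM<<15)|RANDOM)` combines two draws into a 30-bit number, and `% 49152 + 10000` maps it into the range 10000-59151, clear of the well-known ports. `nc -z` attempts a bare TCP connection and exits 0 if something is listening, so the loop repeats until it hits a port with no listener. Probe-then-use does leave a small race window between the check and the launcher binding the port. A hedged sketch of an OS-assigned alternative (an assumption, not what the commit does):

```python
# Sketch of an alternative (not what the commit does): bind to port 0 and
# let the kernel assign an unused port. The port is released again on
# close, so a small reuse race remains here as well.
import socket

def find_free_port(host='127.0.0.1'):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((host, 0))          # port 0 asks the OS for any free port
        return s.getsockname()[1]
```

A second launch script in the commit applies the same search but starts the workers with `torchrun`: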
```bash
#!/usr/bin/env bash
set -x
NGPUS=$1
PY_ARGS=${@:2}

while true
do
    PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
    status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
    if [ "${status}" != "0" ]; then
        break;
    fi
done
echo $PORT

torchrun --nproc_per_node=${NGPUS} --rdzv_endpoint=localhost:${PORT} train.py --launcher pytorch ${PY_ARGS}
```
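
Invocation is unchanged, e.g. `bash dist_train.sh 8 --cfg_file <config>` (arguments here are illustrative); the chosen port is echoed by `echo $PORT` before launch, which helps when diagnosing rendezvous failures.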