Unverified Commit aecc89c3 authored by jihan.yang's avatar jihan.yang Committed by GitHub
Browse files

automatically find unused tcp port when mgpu train or test (#224)

* automatically find unused tcp port when mgpu train or test

* bugfix: set tcp_port before setting the python args
parent 07419768
...@@ -9,7 +9,15 @@ PY_ARGS=${@:3} ...@@ -9,7 +9,15 @@ PY_ARGS=${@:3}
JOB_NAME=eval JOB_NAME=eval
SRUN_ARGS=${SRUN_ARGS:-""} SRUN_ARGS=${SRUN_ARGS:-""}
PORT=$(( ( RANDOM % 10000 ) + 10000 )) while true
do
PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
if [ "${status}" != "0" ]; then
break;
fi
done
echo $PORT
srun -p ${PARTITION} \ srun -p ${PARTITION} \
--job-name=${JOB_NAME} \ --job-name=${JOB_NAME} \
...@@ -18,5 +26,5 @@ srun -p ${PARTITION} \ ...@@ -18,5 +26,5 @@ srun -p ${PARTITION} \
--ntasks-per-node=${GPUS_PER_NODE} \ --ntasks-per-node=${GPUS_PER_NODE} \
--kill-on-bad-exit=1 \ --kill-on-bad-exit=1 \
${SRUN_ARGS} \ ${SRUN_ARGS} \
python -u test.py --launcher slurm ${PY_ARGS} --tcp_port $PORT python -u test.py --launcher slurm --tcp_port $PORT ${PY_ARGS}
...@@ -11,7 +11,15 @@ GPUS_PER_NODE=${GPUS_PER_NODE:-8} ...@@ -11,7 +11,15 @@ GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5} CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""} SRUN_ARGS=${SRUN_ARGS:-""}
PORT=$(( ( RANDOM % 10000 ) + 10000 )) while true
do
PORT=$(( ((RANDOM<<15)|RANDOM) % 49152 + 10000 ))
status="$(nc -z 127.0.0.1 $PORT < /dev/null &>/dev/null; echo $?)"
if [ "${status}" != "0" ]; then
break;
fi
done
echo $PORT
srun -p ${PARTITION} \ srun -p ${PARTITION} \
--job-name=${JOB_NAME} \ --job-name=${JOB_NAME} \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment