#!/usr/bin/env bash
#
# run_train_multi.sh - launch multi-node training through mpirun.
#
# Reads the node list from ./hostfile (relative to the current working
# directory), starts NPROC_PER_NODE ranks on every unique host listed there,
# and runs single_process.sh from the current directory on each rank.
#
# Variables forwarded to single_process.sh with `mpirun -x`:
#   dist_url        first field of the first hostfile entry
#   PYTHON          name of the Python interpreter
#   NPROC_PER_NODE  number of ranks started per node
#
# Exit status: the exit status of mpirun, or 1 when a prerequisite
# (hostfile, mpirun) is missing.

set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Raising the process limit is best effort: the hard limit may be lower than
# the requested value, and that must not abort the launch.
ulimit -u 200000 || printf 'warning: could not raise the process limit\n' >&2

echo "START TIME: $(date)"

hostfile=./hostfile
[ -r "$hostfile" ] || die "hostfile not found or not readable: $hostfile"

export PYTHON=python3
export NPROC_PER_NODE=4

# Count unique hosts by their first field, so that "host" and "host slots=4"
# are the same node; blank lines and '#' comment lines are skipped.
node_count=$(awk 'NF && $1 !~ /^#/ && !seen[$1]++ { n++ } END { print n + 0 }' "$hostfile")
[ "$node_count" -gt 0 ] || die "no hosts listed in $hostfile"

# Total rank count: one rank per GPU, NPROC_PER_NODE GPUs on each node.
np=$((node_count * NPROC_PER_NODE))

echo "$np"

# The first host entry is exported as dist_url for single_process.sh.
dist_url=$(awk 'NF && $1 !~ /^#/ { print $1; exit }' "$hostfile")
export dist_url

# Show which mpirun will be used, and fail early when there is none.
command -v mpirun || die "mpirun not found in PATH"

# Add the Python library directory of the conda environment; users must change
# this path to match their own environment.
# The ':+' expansion avoids a leading ':' (an empty search-path entry) when
# LD_LIBRARY_PATH is not set yet.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/path/of/conda/envs/{env_name}/lib"

# -np  total number of GPUs (ranks)
# -x   pass the named variable through to single_process.sh
# The status is captured instead of letting `set -e` abort, so that END TIME
# is always printed and the mpirun status becomes the script's exit status.
status=0
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  -x dist_url -x PYTHON -x NPROC_PER_NODE \
  "$(pwd)/single_process.sh" || status=$?

echo "END TIME: $(date)"
exit "$status"