run_train_multi.sh 613 Bytes
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
4
5
# Set the per-user process limit (ulimit -u) for this shell and its children.
ulimit -u 200000
echo "START TIME: $(date)"
hostfile=./hostfile

# count_hosts FILE
# Print the number of distinct host names listed in FILE. The host name is the
# first whitespace-separated field of a line. Blank lines and lines whose first
# field starts with '#' are ignored so they are not counted as hosts.
count_hosts() {
  awk 'NF && $1 !~ /^#/ { print $1 }' "$1" | sort -u | wc -l
}

# Number of unique hosts in the hostfile (0 if the file cannot be read).
np=$(count_hosts "$hostfile")
Rayyyyy's avatar
Rayyyyy committed
6
# Scale the host count to the total number of MPI processes: four per host
# (presumably one per accelerator card on each node -- confirm for your cluster).
np=$(( np * 4 ))
Rayyyyy's avatar
Rayyyyy committed
7
8
9
10

echo "$np"

# first_host_entry FILE
# Print the first line of FILE that is neither blank nor a '#' comment, so a
# leading comment or empty line is never mistaken for the first node.
first_host_entry() {
  awk 'NF && $1 !~ /^#/ { print; exit }' < "$1"
}

# First usable hostfile entry; its first field becomes the distributed URL host.
nodename=$(first_host_entry "$hostfile")
Rayyyyy's avatar
Rayyyyy committed
11
# dist_url is the first whitespace-separated field of $nodename. It is exported
# so that child processes (and mpirun's "-x dist_url") can see it.
dist_url=$(printf '%s\n' "$nodename" | awk '{ print $1 }')
export dist_url
Rayyyyy's avatar
Rayyyyy committed
12
13
14
15
16
17
18
19
# Show which mpirun will be used; warn (without aborting) if none is on PATH.
command -v mpirun || echo "warning: mpirun not found in PATH" >&2

# Add the Python environment's lib directory to the loader search path.
# Users must change this to the path of their own environment.
# The ${VAR:+...} form avoids a leading ':' when LD_LIBRARY_PATH is unset or
# empty; an empty entry would make the dynamic loader search the current
# directory.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/path/of/conda/envs/{env_name}/lib
export PYTHON=python3

# -np: number of GPUs
# -x:  pass a variable through to the single_process.sh script
Rayyyyy's avatar
Rayyyyy committed
20
# Launch single_process.sh from the current directory under mpirun with $np
# processes on the hosts listed in the hostfile. dist_url and PYTHON are
# forwarded to the launched processes via -x. Expansions are quoted so that a
# working directory containing spaces still resolves to a single argument.
# The hostfile falls back to the literal name "hostfile" if $hostfile is unset.
mpirun -np "$np" --allow-run-as-root --hostfile "${hostfile:-hostfile}" --bind-to none \
  -x dist_url -x PYTHON "$(pwd)/single_process.sh"
Rayyyyy's avatar
Rayyyyy committed
21
# Log the wall-clock time at which the launcher finished.
printf 'END TIME: %s\n' "$(date)"