run.sh 724 Bytes
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Multi-node launcher: starts single_process.sh under mpirun on the hosts
# listed in ./hostfile and prints the start/end wall-clock time.
ulimit -u 200000
echo "START TIME: $(date)"
hostfile=./hostfile

# Total rank count = number of distinct hostfile lines * 8.
# NOTE(review): 8 ranks per host is hard-coded here while NPROC_PER_NODE below
# is 4 — confirm that single_process.sh expects this combination.
np=$(sort -u "$hostfile" | wc -l)
np=$((np * 8))

echo "$np"

# The first whitespace-separated field of the first hostfile line is handed to
# the workers as dist_url.
nodename=$(sed -n "1p" "$hostfile")
dist_url=$(printf '%s\n' "$nodename" | awk '{print $1}')
# mpirun's -x NAME forwards a variable taken from mpirun's own environment, so
# dist_url has to be exported; a plain shell variable is invisible to mpirun.
export dist_url
command -v mpirun

# Add the Python environment's lib directory to the loader path; users must
# replace the placeholder with the path of their own environment.
# The ${VAR:+...} form avoids a leading ':' (an empty entry, which the loader
# treats as the current directory) when LD_LIBRARY_PATH is unset or empty.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/path/of/conda/envs/{env_name}/lib"
export PYTHON=python3
export NPROC_PER_NODE=4

# Use mpirun for multi-card training, launching one process per card.
# -np  total number of processes (cards)
# -x   forwards the named environment variable to single_process.sh
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  -x dist_url -x PYTHON -x NPROC_PER_NODE "$(pwd)/single_process.sh"
echo "END TIME: $(date)"