run_bert_pre2_4dcus.sh 797 Bytes
Newer Older
hepj987's avatar
hepj987 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
# NOTE(review): a misspelled directive ("SBARCH -n 32") used to sit here. sbatch
# only honours lines that start with the exact SBATCH keyword, so it was never
# applied. It is intentionally NOT re-enabled: a 32-task request does not appear
# to fit 2 nodes x 4 tasks per node. Confirm the intended task count before
# adding an -n directive back.
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#
# Two-node BERT pre-training launcher.
#
# Steps:
#   1. Load the user environment from ~/env22.10.sh.
#   2. Expand the Slurm node list into ./<job id> (one hostname per line).
#   3. Write an mpirun hostfile, ./hostfile-<job id>, with a fixed number of
#      slots per node.
#   4. Start ${WORK_PATH}/2nodes_single_process_pre2.sh through mpirun, one rank
#      per slot, passing the first allocated hostname as its only argument.
#
# Must be run inside a Slurm allocation: SLURM_JOB_ID and SLURM_JOB_NODELIST
# are required. The two generated files are left in the working directory.
set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
# shellcheck disable=SC1090
source ~/env22.10.sh
# Diagnostic only: record which interpreter the environment resolved to.
command -v python3

# Ranks started on each node. Keep in sync with --ntasks-per-node / --gres above.
slots_per_node=4

# Fail early with a clear message instead of writing to "./" or "hostfile-".
: "${SLURM_JOB_ID:?must be run inside a Slurm allocation (submit with sbatch)}"
: "${SLURM_JOB_NODELIST:?Slurm did not provide a node list}"

node_list=./${SLURM_JOB_ID}
mpi_hostfile=$(pwd)/hostfile-${SLURM_JOB_ID}

if ! scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${node_list}"; then
    echo "error: could not expand node list '${SLURM_JOB_NODELIST}'" >&2
    exit 1
fi

# Start from an empty hostfile: a requeued job reuses its job ID, and appending
# to a stale file would list every host twice.
: > "${mpi_hostfile}"
num_node=0
while IFS= read -r host || [[ -n "${host}" ]]; do
    [[ -n "${host}" ]] || continue
    echo "${host} slots=${slots_per_node}" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${node_list}"

if ((num_node == 0)); then
    echo "error: no hosts found in ${node_list}" >&2
    exit 1
fi

num_dcu=$((num_node * slots_per_node))
echo "${num_dcu}"

# The first allocated host is handed to the per-rank launcher as its argument
# (presumably the rendezvous address for distributed init -- confirm in
# 2nodes_single_process_pre2.sh).
nodename=$(sed -n '1p' "${node_list}")
echo "${nodename}"
dist_url=$(awk '{print $1}' <<< "${nodename}")
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process_pre2.sh" "${dist_url}"