run_sbatch.sh 2.01 KB
Newer Older
liangjing's avatar
liangjing committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/bash
#SBATCH -p ty_huchen
#SBATCH -N 2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=4
#SBATCH --gres=dcu:4
#SBATCH -J mlperf
#
# Environment setup for the multi-node benchmark run.
# Writes the allocated host names to ./hostfile and derives:
#   num_node - number of distinct hosts in the allocation
#   num_DCU  - num_node * 4 (presumably matches --gres=dcu:4 above)

# Site-specific toolchain setup; its contents are not visible from this script.
source ~/paddle_dtk21.04.sh
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_GRAPH_FILE=./text.xml
export PYTHON=python3

# Assign first and export second so the helper's exit status is not replaced
# by that of `export`. `list` is a Python module resolved at run time from the
# interpreter's search path (it is not defined in this script).
PADDLE_TRAINER_ENDPOINTS=$("$PYTHON" -c "import list;print(list.get_list())")
export PADDLE_TRAINER_ENDPOINTS
printf '%s\n' "$PADDLE_TRAINER_ENDPOINTS"
#set -e

hostfile=./hostfile
# Quoted: Slurm node lists such as "node[01-02]" are valid glob patterns and
# must reach scontrol unexpanded.
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"
num_node=$(sort -u "$hostfile" | wc -l)
num_DCU=$((num_node * 4))

# Prepend the RCCL and UCX library directories. The ${VAR:+...} form avoids a
# trailing ':' when LD_LIBRARY_PATH starts out empty; an empty entry would
# make the dynamic loader search the current working directory.
export LD_LIBRARY_PATH=/public/home/zhangqha/dtk-21.04/rccl/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=/public/home/zhangqha/ucx/lib:$LD_LIBRARY_PATH
export NCCL_PLUGIN_P2P=ucx
export NCCL_P2P_LEVEL=5
# ---------------------------------------------------------------------------
# Build ./hosts, the rank map handed to mpirun via -machinefile.
# Each host in the host list contributes four lines per pass:
#   pass 1 -> slots 0..3, pass 2 -> slots 4..7
# ---------------------------------------------------------------------------

#######################################
# Append "rank R=HOST slot=S" lines for every host in a host list.
# Globals:   p (read/written) - running host counter; R = p * 4 + index
# Arguments: $1 - file with one host name per line
#            $2 - first slot id to use for each host in this pass
#            $3 - output file (appended to)
#######################################
append_rank_entries() {
  local host_list=$1 slot_base=$2 out_file=$3
  local host idx
  while read -r host || [[ -n "$host" ]]; do
    [[ -n "$host" ]] || continue # skip blank lines
    for ((idx = 0; idx < 4; idx++)); do
      printf 'rank %d=%s slot=%d\n' \
        "$((p * 4 + idx))" "$host" "$((slot_base + idx))" >> "$out_file"
    done
    p=$((p + 1))
  done < "$host_list"
}

# Fall back to the default path if the variable was not set earlier.
host_list_file=${hostfile:-./hostfile}

# -f: a missing file from a previous run is not an error.
rm -f hosts

# p is not reset between the two passes, so the ranks written by the second
# pass continue where the first pass stopped.
p=0
append_rank_entries "$host_list_file" 0 hosts
append_rank_entries "$host_list_file" 4 hosts

nodename=$(sed -n "1p" "$host_list_file")

# Remove a file in the working directory named after the job id.
# NOTE(review): nothing in this script creates that file - confirm it is
# still needed.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  rm -f -- "$SLURM_JOB_ID"
fi

#ldconfig
export HIP_LAUNCH_BLOCKING=1

# Trainer count handed to the benchmark; num_DCU is computed earlier in this
# script.
export PADDLE_TRAINERS_NUM=$num_DCU

# Keep a caller-supplied SEED, otherwise pick a random one.
export SEED=${SEED:-"$RANDOM"}
echo "PADDLE_TRAINER_ENDPOINTS " "$PADDLE_TRAINER_ENDPOINTS"

# Command executed by every rank. Held in an array so that it is never
# re-split on whitespace when expanded.
benchmark_cmd=(bash run_benchmark.sh)

#bash kill_grep.sh $PYTHON || true

# Referencing the variable by name inside $(( )) makes an empty or unset
# value evaluate to 0 instead of raising an arithmetic syntax error.
num_process=$((PADDLE_TRAINERS_NUM * 1))

# With more than one process the benchmark is started through mpirun; with
# one process (or none) it is run directly, so the prefix stays empty.
launch_prefix=()
if ((num_process > 1)); then
  launch_prefix=(
    mpirun --allow-run-as-root -np "$num_process" -machinefile ./hosts
    -mca pml ucx
    -x UCX_IB_ADDR_TYPE=ib_global -x UCX_TLS=rc_x,rocm_copy
    -mca btl_tcp_if_exclude ib0
    --bind-to none
    -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH
    -x SEED -x PYTHON -x NPROC_PER_NODE
  )
fi

for NPROC_PER_NODE in 4; do
  # Exported before the launch because mpirun forwards it with -x.
  export NPROC_PER_NODE
  echo "command is " "${launch_prefix[@]}" "${benchmark_cmd[@]}"
  "${launch_prefix[@]}" "${benchmark_cmd[@]}"
done
# bash kill_grep.sh run_and_time || true
# bash kill_grep.sh python || true