#!/bin/bash
#SBATCH -p ty_huchen 
#SBATCH -N 2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=4
#SBATCH --gres=dcu:4
#SBATCH -J mlperf
# Pull in the site environment script (the name suggests Paddle on DTK 21.04;
# its contents are not visible from this file).
source ~/paddle_dtk21.04.sh

export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_GRAPH_FILE=./text.xml
export PYTHON=python3

# Ask the helper module `list` (presumably a list.py next to this script --
# TODO confirm) for the trainer endpoints. Assignment and export are kept
# separate so a failing interpreter is not hidden behind `export`'s status.
if ! PADDLE_TRAINER_ENDPOINTS=$("$PYTHON" -c "import list;print(list.get_list())"); then
  printf 'warning: could not query trainer endpoints via %s\n' "$PYTHON" >&2
fi
export PADDLE_TRAINER_ENDPOINTS

# Quoted so the value is logged verbatim (no word splitting / globbing).
echo "$PADDLE_TRAINER_ENDPOINTS"
#set -e
# Write the allocation's host names, one per line, to ./hostfile.
hostfile=./hostfile
# Left unquoted on purpose: an unset variable then passes no argument at all.
# NOTE(review): per the scontrol man page, `show hostnames` falls back to
# SLURM_JOB_NODELIST itself in that case -- confirm for this Slurm version.
# shellcheck disable=SC2086
scontrol show hostnames $SLURM_JOB_NODELIST >"${hostfile}"

# Count distinct nodes; 4 per node mirrors the `--gres=dcu:4` request above.
num_node=$(sort -u "${hostfile}" | wc -l)
num_DCU=$(( num_node * 4 ))

# Prepend the UCX and RCCL library directories (UCX ends up first).
export LD_LIBRARY_PATH="/public/home/zhangqha/ucx/lib:/public/home/zhangqha/dtk-21.04/rccl/lib:${LD_LIBRARY_PATH}"

# Communication-library knobs; values are carried over unchanged.
export NCCL_PLUGIN_P2P=ucx \
  HSA_FORCE_FINE_GRAIN_PCIE=1 \
  NCCL_P2P_LEVEL=5
#######################################
# Write the rank/slot map that the launch step hands to mpirun.
# Every line has the form "rank <N>=<host> slot=<S>". Ranks are numbered
# consecutively over two passes across the host list: the first pass gives
# each host slots 0-3, the second pass gives each host slots 4-7.
# Any previous output file is removed before writing.
# Arguments:
#   $1 - file containing host names separated by whitespace/newlines
#   $2 - path of the rank file to create
# Outputs:
#   diagnostics on stderr
# Returns:
#   0 on success, 1 when no host names could be read (no file is written)
#######################################
write_rankfile() {
  local host_list=$1 rank_out=$2
  local -a nodes=() words=()
  local node base slot rank=0

  # -f: a missing file from a previous run is not an error.
  rm -f -- "$rank_out"

  if [[ ! -r "$host_list" ]]; then
    printf 'write_rankfile: cannot read host list: %s\n' "$host_list" >&2
    return 1
  fi

  # Split on whitespace without glob expansion; the `||` clause keeps a
  # final line that lacks a trailing newline.
  while read -r -a words || (( ${#words[@]} )); do
    nodes+=("${words[@]}")
  done <"$host_list"

  if (( ${#nodes[@]} == 0 )); then
    printf 'write_rankfile: no hosts listed in %s\n' "$host_list" >&2
    return 1
  fi

  for base in 0 4; do
    for node in "${nodes[@]}"; do
      for (( slot = base; slot < base + 4; slot++ )); do
        printf 'rank %d=%s slot=%d\n' "$rank" "$node" "$slot"
        rank=$(( rank + 1 ))
      done
    done
  done >"$rank_out"
}

write_rankfile "${hostfile:-./hostfile}" hosts

# First host of the allocation (not referenced again in the visible script).
nodename=$(sed -n '1p' "${hostfile:-./hostfile}")

# Remove a file named after the job id, if any (presumably a stale per-job
# artifact -- TODO confirm). Skipped when SLURM_JOB_ID is unset so rm is
# never invoked without an operand.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  rm -f -- "$SLURM_JOB_ID"
fi

#ldconfig

# Per-process command that the launch step runs.
CMD="bash run_benchmark.sh"

# One trainer per DCU; SEED keeps a caller-supplied value, else a random one.
export HIP_LAUNCH_BLOCKING=1
export PADDLE_TRAINERS_NUM="${num_DCU}"
export SEED="${SEED:-${RANDOM}}"

# Unquoted on purpose to keep the logged line byte-identical.
# shellcheck disable=SC2086
echo "PADDLE_TRAINER_ENDPOINTS " $PADDLE_TRAINER_ENDPOINTS

#bash kill_grep.sh $PYTHON || true

# Total process count handed to the launcher (currently one per trainer).
num_process=$(( ${PADDLE_TRAINERS_NUM} * 1 ))

#######################################
# Fill the global array `launcher` with the mpirun prefix for the job.
# The array is left empty when at most one process is requested, so the
# command then runs directly without mpirun.
# Globals:
#   launcher (written)
# Arguments:
#   $1 - total number of processes to start (defaults to 0)
#######################################
build_launcher() {
  local nproc=${1:-0}

  launcher=()
  if [[ $nproc -gt 1 ]]; then
    # An array keeps every option as its own word; nothing is re-split later.
    launcher=(
      mpirun --allow-run-as-root
      -np "$nproc"
      -machinefile ./hosts
      -mca pml ucx
      -x UCX_IB_ADDR_TYPE=ib_global
      -x UCX_TLS=rc_x,rocm_copy
      -mca btl_tcp_if_exclude ib0
      --bind-to none
      -x PADDLE_TRAINERS_NUM
      -x PADDLE_TRAINER_ENDPOINTS
      -x LD_LIBRARY_PATH
      -x SEED
      -x PYTHON
      -x NPROC_PER_NODE
    )
  fi
}

build_launcher "${num_process:-0}"

# CMD is a plain string; split it into words once, without glob expansion.
read -r -a cmd_words <<<"${CMD:-}"

# Single-value sweep; add values here to try other per-node process counts.
for NPROC_PER_NODE in 4; do
  echo "command is " "${launcher[@]}" "${cmd_words[@]}"
  # Must be exported before the launch: mpirun forwards it via -x.
  export NPROC_PER_NODE
  "${launcher[@]}" "${cmd_words[@]}"
done
# bash kill_grep.sh run_and_time || true
# bash kill_grep.sh python || true
