#!/bin/bash
#
# Launcher for ./run_benchmark.sh.
#
# Exports the environment variables set below and starts ./run_benchmark.sh,
# under mpirun when num_process > 1 and directly otherwise, once for every
# NPROC_PER_NODE value listed in the loop at the bottom.
#
# Environment read from the caller:
#   SEED             optional; defaults to a value drawn from $RANDOM.
#   CMD              optional; word-split and appended after ./run_benchmark.sh.
#   LD_LIBRARY_PATH  optional; the RCCL library directory is prepended to it.
#
# Exit status: the status of the last launch.

export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
export SEED=${SEED:-"$RANDOM"}
# Add the ':' separator only when a previous value exists; an empty element in
# LD_LIBRARY_PATH would make the dynamic linker search the current directory.
export LD_LIBRARY_PATH=/opt/dtk-21.04/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export NCCL_P2P_LEVEL=5
#export use_hierarchical_allreduce=True
# NOTE(review): -np below is 16 while PADDLE_TRAINERS_NUM and the endpoint list
# describe 8 trainers -- confirm this is intended.
export num_process=16

# The launch command is kept in an array so every argument stays one word.
launch_cmd=()
if (( num_process > 1 )); then
  # NOTE(review): MIOPEN_FIND_MODE is exported above but is not in the -x
  # list; confirm whether ranks started on other hosts need it.
  launch_cmd=(
    mpirun --allow-run-as-root -np "$num_process" --bind-to none
    -x UCX_IB_ADDR_TYPE=ib_global
    -x UCX_TLS=rc_x,rocm_copy
    -mca btl_tcp_if_exclude ib0
    -x PADDLE_TRAINERS_NUM
    -x PADDLE_TRAINER_ENDPOINTS
    -x LD_LIBRARY_PATH
    -x SEED
    -x PYTHON
    -x NPROC_PER_NODE
  )
  # Its export above is commented out, so forward it only when the caller
  # actually set it; otherwise there is nothing to pass along.
  if [[ -n "${use_hierarchical_allreduce+set}" ]]; then
    launch_cmd+=(-x use_hierarchical_allreduce)
  fi
fi

# The benchmark script is launched in both the mpirun and the single-process
# case; CMD, when set, supplies extra words appended after it.
launch_cmd+=(./run_benchmark.sh)
# Word splitting of CMD is intentional: it carries a list of arguments.
# shellcheck disable=SC2206
launch_cmd+=(${CMD:-})

for NPROC_PER_NODE in 8; do
  # Exported before the launch so that "-x NPROC_PER_NODE" has a value to pass.
  export NPROC_PER_NODE
  echo "command is " "${launch_cmd[@]}"
  "${launch_cmd[@]}"
done

#mpirun -np 8 --allow-run-as-root  --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON ./run_benchmark.sh
