#!/bin/bash
#
# Launch ./run_benchmark.sh through mpirun with the environment exported below.
#
# Environment read by this script:
#   SEED             optional; falls back to $RANDOM when unset or empty
#   LD_LIBRARY_PATH  optional; the rccl lib directory is prepended to it
#   CMD              optional; its words are appended to the launch command
#
# Exit status: 0 when every launch succeeded, otherwise the status of the
# last launch that failed (127 when the launcher binary is not on PATH).
#
# Strict mode (set -e / set -u) is intentionally not enabled: a failed launch
# is recorded and reported through the exit status instead of aborting.

export MIOPEN_FIND_MODE=1
export PADDLE_TRAINERS_NUM=8
export PADDLE_TRAINER_ENDPOINTS=localhost:60005,localhost:60006,localhost:60007,localhost:60008,localhost:60009,localhost:60010,localhost:60011,localhost:60012
export PYTHON=python3
export SEED="${SEED:-$RANDOM}"
# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty search-path entry, which the dynamic loader resolves to the current
# working directory.
export LD_LIBRARY_PATH="/opt/dtk-21.04/rccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
#export HSA_FORCE_FINE_GRAIN_PCIE=1
#export NCCL_P2P_LEVEL=5
#export use_hierarchical_allreduce=True
export num_process=16

#######################################
# Build the launch command and run it once per NPROC_PER_NODE value.
# Globals:   num_process (read), CMD (read), NPROC_PER_NODE (exported)
# Arguments: none
# Outputs:   echoes the command line to stdout; errors go to stderr
# Returns:   0 if all launches succeeded, else the last non-zero status
#######################################
main() {
  local -a launch_cmd=()
  local -a extra_words=()
  local nproc
  local status=0

  # Only wrap the benchmark in mpirun when more than one process is requested;
  # otherwise the command consists solely of whatever CMD supplies.
  if (( num_process > 1 )); then
    launch_cmd=(
      mpirun --allow-run-as-root -np "${num_process}" --bind-to none
      -x UCX_IB_ADDR_TYPE=ib_global
      -x UCX_TLS=rc_x,rocm_copy
      -mca btl_tcp_if_exclude ib0
      -x PADDLE_TRAINERS_NUM
      -x PADDLE_TRAINER_ENDPOINTS
      -x LD_LIBRARY_PATH
      -x SEED
      -x PYTHON
      -x NPROC_PER_NODE
      # NOTE(review): use_hierarchical_allreduce is only set in a commented-out
      # export above, so it is forwarded only if the caller's environment
      # defines it.
      -x use_hierarchical_allreduce
      ./run_benchmark.sh
    )
  fi

  # CMD is a plain string of extra words; splitting it on whitespace is the
  # intended behaviour.
  # shellcheck disable=SC2206
  extra_words=(${CMD:-})
  launch_cmd+=("${extra_words[@]}")

  echo "command is " "${launch_cmd[@]}"

  for nproc in 8; do
    echo "command is " "${launch_cmd[@]}"
    # Exported before launching because the mpirun command forwards it with
    # `-x NPROC_PER_NODE`.
    export NPROC_PER_NODE="${nproc}"

    if (( ${#launch_cmd[@]} > 0 )) \
      && ! command -v "${launch_cmd[0]}" >/dev/null 2>&1; then
      printf 'error: %s not found on PATH\n' "${launch_cmd[0]}" >&2
      status=127
      continue
    fi

    "${launch_cmd[@]}" || status=$?
  done

  return "${status}"
}

#mpirun -np 8 --allow-run-as-root --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON ./run_benchmark.sh

main "$@"