#!/bin/bash
#SBATCH -p xxx
#SBATCH -N 2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=4
#SBATCH --gres=dcu:4
#SBATCH -J mlperf

source ./env.sh
cp ./rundir_8gpu/init_env.py .

export HSA_FORCE_FINE_GRAIN_PCIE=1
export PYTHON=python3

# Trainer endpoints come from the local helper module list.py (it shadows the
# builtin name, so this only works from a directory containing list.py).
export PADDLE_TRAINER_ENDPOINTS=$($PYTHON -c "import list;print(list.get_list())")
echo $PADDLE_TRAINER_ENDPOINTS

#set -e

# Derive the node list for this job and the total DCU count (4 DCUs per node).
hostfile=./hostfile
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=$(cat $hostfile | sort | uniq | wc -l)
num_DCU=$(($num_node*4))

export LD_LIBRARY_PATH=/public/software/compiler/rocm/dtk-21.04/rccl/lib:$LD_LIBRARY_PATH
export NCCL_P2P_LEVEL=5
export use_hierarchical_allreduce=True
export NCCL_IB_HCA=mlx5_0:1

# Build the Open MPI rankfile `hosts`: 8 ranks per node, i.e. two ranks per
# DCU. The first pass assigns ranks 0..4*num_node-1 to slots 0-3; the second
# pass continues counting p (deliberately not reset) and assigns the remaining
# ranks to slots 4-7.
rm -f hosts
p=0
for i in $(cat $hostfile); do
    for j in $(seq 1 4); do
        num=$(( p * 4 + j - 1 ))
        echo "rank ${num}=${i} slot=$(( j - 1 ))" >> hosts
    done
    p=$(( p + 1 ))
done
for i in $(cat $hostfile); do
    for j in $(seq 1 4); do
        num=$(( p * 4 + j - 1 ))
        echo "rank ${num}=${i} slot=$(( j + 3 ))" >> hosts
    done
    p=$(( p + 1 ))
done

export HIP_LAUNCH_BLOCKING=1
export PADDLE_TRAINERS_NUM=$num_DCU
export SEED=${SEED:-"$RANDOM"}
echo "PADDLE_TRAINER_ENDPOINTS " $PADDLE_TRAINER_ENDPOINTS

# Two processes per DCU, matching the rankfile built above.
num_process=$(($PADDLE_TRAINERS_NUM*2))
if [[ $num_process -gt 1 ]]; then
    mpirun="mpirun --allow-run-as-root -np $num_process --rankfile hosts -mca btl_tcp_if_exclude ib0 --bind-to none -x PADDLE_TRAINERS_NUM -x PADDLE_TRAINER_ENDPOINTS -x LD_LIBRARY_PATH -x SEED -x PYTHON -x NPROC_PER_NODE -x use_hierarchical_allreduce ./run_benchmark.sh"
else
    mpirun=""
fi

# CMD is not set in this script; it may be provided by env.sh. If it is empty,
# each launch runs ./run_benchmark.sh via mpirun with no extra arguments.
for NPROC_PER_NODE in 4; do
    echo "command is " $mpirun $CMD
    export NPROC_PER_NODE=$NPROC_PER_NODE
    $mpirun $CMD
done
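
# For reference, a worked example of the rankfile this script generates
# (hostnames are hypothetical): with the 2 nodes requested above, say
# node001 and node002, `hosts` contains 16 entries and num_process is 16,
# with ranks 0-7 on slots 0-3 and ranks 8-15 on slots 4-7:
#
#   rank 0=node001 slot=0
#   rank 1=node001 slot=1
#   rank 2=node001 slot=2
#   rank 3=node001 slot=3
#   rank 4=node002 slot=0
#   ...
#   rank 7=node002 slot=3
#   rank 8=node001 slot=4
#   ...
#   rank 11=node001 slot=7
#   rank 12=node002 slot=4
#   ...
#   rank 15=node002 slot=7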