#!/bin/bash

# Per-process rank information provided by the Open MPI launcher.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

## DL params
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'

# Set variables (values already set in the environment take precedence)
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}

# run benchmark
echo "running benchmark"

PARAMS=(
    --batch-size        "${BATCHSIZE}"
    --eval-batch-size   "${EVALBATCHSIZE}"
    --epochs            "${NUMEPOCHS}"
    --print-freq        "${LOG_INTERVAL}"
    --dataset-path      "${DATASET_DIR}"
    --local_rank        "${comm_rank}"
    --world-size        "${comm_size}"
)

# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"

# Bind each local rank to its own NUMA node with numactl; all eight GPUs
# remain visible to every rank via HIP_VISIBLE_DEVICES.
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
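
# Example launch (a sketch, not part of the original script): the rank
# variables above come from Open MPI, so the script is intended to be started
# one process per NUMA node via mpirun. The process count, binding policy,
# and script name below are assumptions and depend on the cluster setup:
#
#   mpirun -np 8 --bind-to none ./run_training.sh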