#!/bin/bash

## DL params
export HSA_FORCE_FINE_GRAIN_PCIE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'

## System config params
export DGXNGPU=8

# Set variables (overridable from the calling environment)
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
# Exported so the training process can find/cache pretrained weights here.
export TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}

# run benchmark
echo "running benchmark"

PARAMS=(
    --batch-size      "${BATCHSIZE}"
    --eval-batch-size "${EVALBATCHSIZE}"
    --epochs          "${NUMEPOCHS}"
    --print-freq      "${LOG_INTERVAL}"
    --dataset-path    "${DATASET_DIR}"
)

# run training; EXTRA_PARAMS is intentionally left unquoted so it word-splits into individual flags
torchrun --nproc_per_node="${DGXNGPU}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} 2>&1 | tee ssd_bs16_epoch6.log
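
# Usage sketch (the script file name and override values below are illustrative, not part of
# the original): EVALBATCHSIZE and LOG_INTERVAL default as set above but can be overridden
# per run from the environment, e.g.
#   EVALBATCHSIZE=32 LOG_INTERVAL=50 bash run_ssd_training.sh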