#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J fp16_64_mlperf
#SBATCH -o ./fp16/output.%j
#SBATCH -e ./fp16/output.%j

# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Slurm batch script: launches train_fp16.py through the bind_launch module
# on a single node, times the run, and prints a "RESULT,..." summary line.
#
# Inputs read from the environment (after sourcing config_singlenode.sh):
#   DGXNGPU, DGXNSOCKET, DGXSOCKETCORES - launcher topology settings
#   MULTI_NODE                          - extra launcher arguments (may be empty)
#   EXTRA_PARAMS                        - extra train_fp16.py arguments
#   NUMEPOCHS (default 70), LR (default 2.5e-3)
#
# NOTE(review): stdout and stderr both go to ./fp16/output.<jobid>; the
# ./fp16 directory presumably has to exist before submission - confirm.

module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
export MIOPEN_DEBUG_DISABLE_FIND_DB=1

# Single-node configuration, resolved relative to the current working
# directory. Sourced before `set -e`, so a failure here is not fatal by itself.
source "$(pwd)/config_singlenode.sh"

# Fall back to config/random values when not running under Slurm.
SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE:-$DGXNGPU}
SLURM_JOB_ID=${SLURM_JOB_ID:-$RANDOM}
echo "Run vars: id $SLURM_JOB_ID gpus $SLURM_NTASKS_PER_NODE mparams $MULTI_NODE"

set -e

# start timing
start=$(date +%s)
start_fmt=$(date '+%Y-%m-%d %r')
echo "STARTING TIMING RUN AT $start_fmt"

# run benchmark
set -x
NUMEPOCHS=${NUMEPOCHS:-70}
LR=${LR:-"2.5e-3"}

echo "running benchmark"

export DATASET_DIR="/public/software/apps/DeepLearning/Data/COCO2017"

# The status is captured with `|| ret_code=$?` because `set -e` is active:
# a plain `cmd; ret_code=$?` would abort the script on failure before the
# assignment, skipping the trace shutdown and the explicit exit below.
#
# MULTI_NODE and EXTRA_PARAMS are intentionally left unquoted: their
# definitions are not visible here and they may rely on word splitting
# (MULTI_NODE may be empty; EXTRA_PARAMS may be a plain string or an array).
ret_code=0
# shellcheck disable=SC2086,SC2068
python3 -m bind_launch \
  --nsockets_per_node "${DGXNSOCKET}" \
  --ncores_per_socket "${DGXSOCKETCORES}" \
  --nproc_per_node "${SLURM_NTASKS_PER_NODE}" ${MULTI_NODE} \
  --no_hyperthreads \
  --no_membind \
  train_fp16.py \
  --epochs "${NUMEPOCHS}" \
  --warmup-factor 0 \
  --lr "${LR}" \
  --no-save \
  --threshold=0.23 \
  --data "${DATASET_DIR}" \
  --opt-level O3 --loss-scale="dynamic" \
  ${EXTRA_PARAMS[@]} || ret_code=$?

set +x

sleep 3
# Propagate the launcher's failure status; no RESULT line is printed then.
if ((ret_code != 0)); then
  exit "$ret_code"
fi

# end timing
end=$(date +%s)
end_fmt=$(date '+%Y-%m-%d %r')
echo "ENDING TIMING RUN AT $end_fmt"

# report result: elapsed wall-clock seconds between the two timestamps
result=$((end - start))
result_name="OBJECT_DETECTION"

echo "RESULT,$result_name,,$result,nvidia,$start_fmt"