#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the SSD (single-stage detector) benchmark and reports time to
# convergence.
#
# Usage:
#   run_and_time.sh
#
# Expects config_DGXA100_001x08x032.sh next to this script; that config is
# presumed to supply DGXNNODES / DGXNGPU / DATESTAMP and friends — TODO
# confirm, the file is not visible here.

set +x
set -e

source config_DGXA100_001x08x032.sh

# Only rank 0 prints trace output (no-op here since tracing is already off;
# kept from the upstream multi-rank launcher).
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x

# --- start timing ---------------------------------------------------------
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# --- set variables (env vars override the defaults below) -----------------
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}

# --- run benchmark --------------------------------------------------------
echo "running benchmark"

#if [ ${NVTX_FLAG} -gt 0 ]; then
## FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
# NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
#else
# NSYSCMD=""
#fi
#if [ ${SYNTH_DATA} -gt 0 ]; then
#EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
#EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
#fi

# NOTE(review): CMD is declared for the upstream Slurm/Docker launch modes
# below, but the active launch further down uses torch.distributed.launch
# directly, so CMD is currently unused.
declare -a CMD
#if [ -n "${SLURM_LOCALID-}" ]; then
#  # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
#  if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
#    CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
#  else
#    CMD=( ${NSYSCMD} 'python' '-u' )
#  fi
#else
#  # Mode 2: Single-node Docker, we've been launched with `torch_run`
#  # TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
#  CMD=( "python" )
#fi
CMD=( "python" )

#if [ "$LOGGER" = "apiLog.sh" ];
#then
#  LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
#  # TODO(ahmadki): track the apiLog.sh bug and remove the workaround
#  # there is a bug in apiLog.sh preventing it from collecting
#  # NCCL logs, the workaround is to log a single rank only
#  # LOCAL_RANK is set with an enroot hook for Pytorch containers
#  # SLURM_LOCALID is set by Slurm
#  # OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
#  readonly node_rank="${SLURM_NODEID:-0}"
#  readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
#  if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
#  then
#    LOGGER=$LOGGER
#  else
#    LOGGER=""
#  fi
#fi

# Arguments forwarded to train.py; kept as an array so values with spaces
# (e.g. the dataset path) survive intact.
PARAMS=(
  --lr                      "${LR}"
  --batch-size              "${BATCHSIZE}"
  --eval-batch-size         "${EVALBATCHSIZE}"
  --epochs                  "${NUMEPOCHS}"
  --print-freq              "${LOG_INTERVAL}"
  --dataset-path            "${DATASET_DIR}"
  --warmup-epochs           "${WARMUP_EPOCHS}"
)

# ROCm / NCCL environment (this port runs on AMD GPUs 4 and 5).
export HIP_VISIBLE_DEVICES=4,5
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
export NCCL_P2P_LEVEL=5

# --- run training ---------------------------------------------------------
#${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
#
# BUGFIX: with `set -e` active, `cmd ; ret_code=$?` never captures a failure
# (the shell exits before the assignment runs), making the ret_code check
# below dead code.  Capture the status with `|| ret_code=$?` instead, which
# suppresses the errexit for this one command.
# EXTRA_PARAMS is intentionally unquoted: it is a space-separated option
# string that must word-split into individual arguments.
ret_code=0
python -m torch.distributed.launch --nnodes 1 --nproc_per_node=2 \
  train.py "${PARAMS[@]}" ${EXTRA_PARAMS} || ret_code=$?

sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# --- end timing -----------------------------------------------------------
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# --- report result --------------------------------------------------------
result=$(( end - start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"