#!/bin/bash
# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the SSD (single-stage detector) benchmark and reports time to
# convergence.
#
# Usage:
#   run_and_time.sh
#
# Expects config_DGXA100_001x08x032.sh next to this script; that config is
# presumed to supply DGXNNODES / DGXNGPU / DATESTAMP and friends — TODO
# confirm, the file is not visible here.

set +x
set -e

source config_DGXA100_001x08x032.sh

# Only rank 0 prints trace output (no-op here since tracing is already off;
# kept from the upstream multi-rank launcher).
[ "${SLURM_LOCALID-0}" -ne 0 ] && set +x

# --- start timing ---------------------------------------------------------
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# --- set variables (env vars override the defaults below) -----------------
[ "${DEBUG}" = "1" ] && set -x
LR=${LR:-0.0001}
WARMUP_EPOCHS=${WARMUP_EPOCHS:-1}
BATCHSIZE=${BATCHSIZE:-2}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-10}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/public/home/liangjj/2023/training_results_v2.1-main/NVIDIA/benchmarks/ssd/implementations/pytorch-22.09/public-scripts/datasets/open-images-v6"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-home"}
TIME_TAGS=${TIME_TAGS:-0}
NVTX_FLAG=${NVTX_FLAG:-0}
NCCL_TEST=${NCCL_TEST:-0}
EPOCH_PROF=${EPOCH_PROF:-0}
SYNTH_DATA=${SYNTH_DATA:-0}
DISABLE_CG=${DISABLE_CG:-0}

# --- run benchmark --------------------------------------------------------
echo "running benchmark"

#if [ ${NVTX_FLAG} -gt 0 ]; then
## FIXME mfrank 2022-May-24: NSYSCMD needs to be an array, not a space-separated string
# NSYSCMD=" /nsight/bin/nsys profile --capture-range cudaProfilerApi --capture-range-end stop --sample=none --cpuctxsw=none --trace=cuda,nvtx --force-overwrite true --output /results/single_stage_detector_pytorch_${DGXNNODES}x${DGXNGPU}x${BATCHSIZE}_${DATESTAMP}_${SLURM_PROCID}_${SYNTH_DATA}_${DISABLE_CG}.nsys-rep "
#else
# NSYSCMD=""
#fi
#if [ ${SYNTH_DATA} -gt 0 ]; then
#EXTRA_PARAMS+=" --syn-dataset --cuda-graphs-syn "
#EXTRA_PARAMS=$(echo $EXTRA_PARAMS | sed 's/--dali//')
#fi

# NOTE(review): CMD is declared for the upstream Slurm/Docker launch modes
# below, but the active launch further down uses torch.distributed.launch
# directly, so CMD is currently unused.
declare -a CMD
#if [ -n "${SLURM_LOCALID-}" ]; then
#  # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
#  if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
#    CMD=( 'bindpcie' '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
#  else
#    CMD=( ${NSYSCMD} 'python' '-u' )
#  fi
#else
#  # Mode 2: Single-node Docker, we've been launched with `torch_run`
#  # TODO: Replace below CMD with NSYSCMD..., but make sure NSYSCMD is an array, not a string
#  CMD=( "python" )
#fi
CMD=( "python" )

#if [ "$LOGGER" = "apiLog.sh" ];
#then
#  LOGGER="${LOGGER} -p MLPerf/${MODEL_NAME} -v ${FRAMEWORK}/train/${DGXSYSTEM}"
#  # TODO(ahmadki): track the apiLog.sh bug and remove the workaround
#  # there is a bug in apiLog.sh preventing it from collecting
#  # NCCL logs, the workaround is to log a single rank only
#  # LOCAL_RANK is set with an enroot hook for Pytorch containers
#  # SLURM_LOCALID is set by Slurm
#  # OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun
#  readonly node_rank="${SLURM_NODEID:-0}"
#  readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}"
#  if [ "$node_rank" -eq 0 ] && [ "$local_rank" -eq 0 ];
#  then
#    LOGGER=$LOGGER
#  else
#    LOGGER=""
#  fi
#fi

# Arguments forwarded to train.py; kept as an array so values with spaces
# (e.g. the dataset path) survive intact.
PARAMS=(
  --lr                      "${LR}"
  --batch-size              "${BATCHSIZE}"
  --eval-batch-size         "${EVALBATCHSIZE}"
  --epochs                  "${NUMEPOCHS}"
  --print-freq              "${LOG_INTERVAL}"
  --dataset-path            "${DATASET_DIR}"
  --warmup-epochs           "${WARMUP_EPOCHS}"
)

# ROCm / NCCL environment (this port runs on AMD GPUs 4 and 5).
export HIP_VISIBLE_DEVICES=4,5
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export MIOPEN_FIND_MODE=5
#export NCCL_NET_GDR_LEVEL=5
export NCCL_P2P_LEVEL=5

# --- run training ---------------------------------------------------------
#${LOGGER:-} "${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} ; ret_code=$?
#
# BUGFIX: with `set -e` active, `cmd ; ret_code=$?` never captures a failure
# (the shell exits before the assignment runs), making the ret_code check
# below dead code.  Capture the status with `|| ret_code=$?` instead, which
# suppresses the errexit for this one command.
# EXTRA_PARAMS is intentionally unquoted: it is a space-separated option
# string that must word-split into individual arguments.
ret_code=0
python -m torch.distributed.launch --nnodes 1 --nproc_per_node=2 \
  train.py "${PARAMS[@]}" ${EXTRA_PARAMS} || ret_code=$?

sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# --- end timing -----------------------------------------------------------
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# --- report result --------------------------------------------------------
result=$(( end - start ))
result_name="SINGLE_STAGE_DETECTOR"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"