#!/bin/bash

# Load the benchmark configuration from the current working directory.
# The path is quoted so it stays a single word even when the directory name
# contains whitespace or glob characters.
# Variables used later that this script never assigns (e.g. MATH, LR,
# MAX_SEQ_LEN) are presumably defined by this file or the environment -
# TODO confirm against config_DGX1.sh.
# NOTE: this runs before `set -e`, so a non-zero status returned by `source`
# itself does not abort the script.
source "$(pwd)/config_DGX1.sh"

# From here on, abort on the first unhandled command failure.
set -e

# start timing
start=$(date +%s)                 # epoch seconds; subtracted from the end time later
start_fmt=$(date '+%Y-%m-%d %r')  # human-readable start time for the log lines
echo "STARTING TIMING RUN AT $start_fmt"

# Default NCCL_DEBUG to WARN unless the caller already provided a value.
export NCCL_DEBUG=${NCCL_DEBUG:-"WARN"}

# run benchmark
# Turn on command tracing for the settings below and the training launch.
set -x

# Fixed input/output locations, relative to the working directory.
DATASET_DIR="../wmt16_de_en/"
PREPROC_DATADIR="./preproc_data"
RESULTS_DIR="gnmt_wmt16"

# Optional overrides taken from the environment; both default to empty.
# DIST_OPTS is not referenced again in the visible part of this script.
DIST_OPTS="${DIST_OPTS:-}"
EXTRA_OPTS="${EXTRA_OPTS:-}"

# Argument vector for a bind_launch invocation built from the DGX* settings.
# It is only assembled here; the visible part of this script never runs it.
declare -a CMD
CMD=(
  python3 -u -m bind_launch
  "--nsockets_per_node=${DGXNSOCKET}"
  "--ncores_per_socket=${DGXSOCKETCORES}"
  "--nproc_per_node=${DGXNGPU}"
  --no_hyperthreads
)

echo "running benchmark"

# run training
# Original note: "for 1 card fp32 training". HIP_VISIBLE_DEVICES=0 is set only
# in the environment of this one command; the math mode actually used is
# whatever ${MATH} holds.
#
# `set -e` is active at this point, so a plain `cmd; ret_code=$?` would abort
# the script on failure before the status is ever recorded. Capturing it with
# `|| ret_code=$?` keeps the script alive so tracing is switched off and the
# exit status is propagated explicitly below.
#
# EXTRA_OPTS is deliberately left unquoted: it must word-split into separate
# arguments and disappear entirely when empty.
ret_code=0
# shellcheck disable=SC2086
HIP_VISIBLE_DEVICES=0 python3 train.py \
  --save "${RESULTS_DIR}" \
  --dataset-dir "${DATASET_DIR}" \
  --preproc-data-dir "${PREPROC_DATADIR}/${MAX_SEQ_LEN}" \
  --target-bleu "${TARGET}" \
  --epochs "${NUMEPOCHS}" \
  --math "${MATH}" \
  --max-length-train "${MAX_SEQ_LEN}" \
  --print-freq 10 \
  --train-batch-size "${TRAIN_BATCH_SIZE}" \
  --test-batch-size "${TEST_BATCH_SIZE}" \
  --optimizer Adam \
  --lr "${LR}" \
  --warmup-steps "${WARMUP_STEPS}" \
  --remain-steps "${REMAIN_STEPS}" \
  --decay-interval "${DECAY_INTERVAL}" \
  ${EXTRA_OPTS} || ret_code=$?

# Stop tracing before the summary output.
set +x

# Brief pause before the status check (its purpose is not evident from this
# script).
sleep 3

# Propagate a training failure to the caller with the same exit status.
if (( ret_code != 0 )); then
  exit "${ret_code}"
fi

# end timing
end=$(date +%s)                 # epoch seconds at completion
end_fmt=$(date '+%Y-%m-%d %r')  # human-readable completion time
echo "ENDING TIMING RUN AT ${end_fmt}"

# report result
# Emit one CSV-style summary line: benchmark name, elapsed wall-clock seconds
# (end minus start), and the formatted start time.
result_name="RNN_TRANSLATOR"
result=$(( end - start ))

echo "RESULT,${result_name},,${result},nvidia,${start_fmt}"

