#!/bin/bash

echo "STARTING TIMING RUN AT $start_fmt"

# run benchmark
set -x
RESULTS_DIR='results/gnmt_wmt16'
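# NOTE: RESULTS_DIR is declared but never passed to train.py below; creating it
# here is an assumption that downstream tooling expects this directory to exist.
mkdir -p "${RESULTS_DIR}"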

GPUS=4                # number of GPU processes launched by torch.distributed.launch
CPUS=1                # host CPU sockets (informational; not used below)
CORES_PER_CPU=32      # cores per socket (informational; not used below)

TRAIN_BATCH_SIZE=64   # per-GPU training batch size
NUMEPOCHS=1           # number of training epochs
TRAIN_SEQ_LEN=50      # maximum training sequence length

MATH="fp16"           # numeric precision flag passed to train.py
GLOBAL_BATCH_SIZE=$(( GPUS * TRAIN_BATCH_SIZE ))
echo "running benchmark"

# run training
python3 -m torch.distributed.launch --nproc_per_node=${GPUS} train.py \
    --seed 2 \
    --epochs ${NUMEPOCHS} \
    --train-batch-size ${TRAIN_BATCH_SIZE} \
    --train-global-batch-size ${GLOBAL_BATCH_SIZE} \
    --train-max-length ${TRAIN_SEQ_LEN} \
    --math ${MATH}


set +x
# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"