#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 30
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J 12_30_nv_lr
#SBATCH -o ./test/output.%j
#SBATCH -e ./test/output.%j
#
# Slurm batch script that starts train.py (through `python3 -m bind_launch`)
# on every node of the allocation over ssh and reports the wall-clock time
# of the whole run.
#
# Flow:
#   1. Load the ROCm/PyTorch environment and the DGX1 config file.
#   2. Expand the Slurm node list into a hostfile; the first host is used as
#      the master address (--master_addr).
#   3. Launch ranks 0..N-2 in the background and rank N-1 in the foreground.
#   4. Wait for all launchers, then print the timing/result lines.
#
# Environment overrides: NUMEPOCHS (default 100), LR (default 2.5e-3, see
# the note next to its assignment).
# DGXNGPU, DGXNSOCKET, DGXSOCKETCORES and MULTI_NODE are not set here; they
# are expected to come from the sourced config file (TODO confirm).

# All paths handed to the remote shells are relative to the submit directory.
run_dir=$(pwd)
rocm_env_script=/public/home/aiss/Pytorch/rocm3.3_env_torch.sh

module rm compiler/rocm/2.9
source "${rocm_env_script}"

# for single node
source "${run_dir}/config_DGX1_singlenode.sh"
# for multinode
#source "${run_dir}/config_DGX1_multinode.sh"

SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE:-$DGXNGPU}
SLURM_JOB_ID=${SLURM_JOB_ID:-$RANDOM}
echo "Run vars: id $SLURM_JOB_ID gpus $SLURM_NTASKS_PER_NODE mparams $MULTI_NODE"

# Enabled only after the environment scripts have been sourced, as before.
set -e

# start timing
start=$(date +%s)
start_fmt=$(date '+%Y-%m-%d %r')
echo "STARTING TIMING RUN AT $start_fmt"

# run benchmark
set -x
NUMEPOCHS=${NUMEPOCHS:-100}
# NOTE(review): LR is not passed to train.py; the launch commands below
# hard-code --lr 4.1e-3. Kept so existing callers that set LR keep working.
LR=${LR:-"2.5e-3"}
echo "running benchmark"
export DATASET_DIR="/public/software/apps/DeepLearning/Data/COCO2017"

# Expand the compact Slurm node list (e.g. "node[1-30]") to one host per line.
# The node list is quoted so its brackets are never treated as a glob.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"
# ./tmp is still written for compatibility, but it is no longer read back:
# its name is shared by every job started from this directory, so reading it
# could pick up another job's node list.
cat -- "${hostfile}" > "${run_dir}/tmp"

mapfile -t nodes < "${hostfile}"
num_nodes=${#nodes[@]}
if (( num_nodes == 0 )); then
  echo "error: no hosts found for node list '${SLURM_JOB_NODELIST}'" >&2
  exit 1
fi
dist_url=${nodes[0]}

#######################################
# Print the command line executed on a remote node for one rank.
# Globals:   run_dir, rocm_env_script, num_nodes, dist_url, DGXNSOCKET,
#            DGXSOCKETCORES, SLURM_NTASKS_PER_NODE, NUMEPOCHS, DATASET_DIR
# Arguments: $1 - node rank (0-based)
#            $2 - value for --batch-size
#            $3 - value for --warmup
#            $4 - optional text placed directly before "python3"
#                 (e.g. "VAR=value "; include the trailing space)
# Outputs:   the remote command, as a single line, on stdout
#######################################
build_remote_cmd() {
  local node_rank=$1 batch_size=$2 warmup=$3 env_prefix=${4:-}
  # printf reuses the '%s' format for every argument, so the pieces below
  # are simply concatenated into one line.
  printf '%s' \
    "cd ${run_dir}" \
    " && module rm compiler/rocm/2.9" \
    " && source ${rocm_env_script}" \
    " && ${env_prefix}python3 -m bind_launch" \
    " --nnodes=${num_nodes}" \
    " --node_rank=${node_rank}" \
    " --master_addr=${dist_url}" \
    " --master_port=4567" \
    " --nsockets_per_node ${DGXNSOCKET}" \
    " --ncores_per_socket ${DGXSOCKETCORES}" \
    " --nproc_per_node ${SLURM_NTASKS_PER_NODE}" \
    " --no_hyperthreads" \
    " --no_membind" \
    " ${run_dir}/train.py" \
    " --epochs ${NUMEPOCHS}" \
    " --warmup-factor 0" \
    " --threshold=0.23" \
    " --data ${DATASET_DIR}" \
    " --batch-size ${batch_size}" \
    " --warmup ${warmup}" \
    " --lr 4.1e-3" \
    " --wd 2e-4" \
    " --snapshot_path=${run_dir}/${dist_url}"
}

# Plain assignment instead of ((last_rank = num_nodes - 1)): an arithmetic
# command returns status 1 when its value is 0, which under `set -e` aborted
# the script whenever the job had exactly one node.
last_rank=$(( num_nodes - 1 ))

# Ranks 0 .. N-2 run in the background, one ssh session per node.
launcher_pids=()
for (( i = 0; i < last_rank; i++ )); do
  remote_cmd=$(build_remote_cmd "$i" 12 10)
  ssh "${nodes[i]}" "${remote_cmd}" &
  launcher_pids+=("$!")
done

# The last rank runs in the foreground; under `set -e` a failure here aborts
# the script, as it did before.
# NOTE(review): this rank uses --batch-size 10 --warmup 18 and sets
# HIP_VISIBLE_DEVICES, while the other ranks use --batch-size 12 --warmup 10
# without it. Left unchanged; confirm that the difference is intentional.
remote_cmd=$(build_remote_cmd "${last_rank}" 10 18 "HIP_VISIBLE_DEVICES=0,1,2,3 ")
ssh "${nodes[last_rank]}" "${remote_cmd}"

# Reap the background launchers so the reported time covers every rank and
# the batch script does not finish while other ranks are still running.
# Failures are reported but do not change the exit status.
for pid in "${launcher_pids[@]}"; do
  wait "${pid}" || echo "warning: background launcher (pid ${pid}) exited with status $?" >&2
done

# Local (non-ssh) launch, kept for reference:
#python3 -m bind_launch --nsockets_per_node ${DGXNSOCKET} \
#  --ncores_per_socket ${DGXSOCKETCORES} \
#  --nproc_per_node $SLURM_NTASKS_PER_NODE $MULTI_NODE \
#  --no_hyperthreads \
#  --no_membind \
#  train.py \
#  --epochs "${NUMEPOCHS}" \
#  --warmup-factor 0 \
#  --lr "${LR}" \
#  --no-save \
#  --threshold=0.23 \
#  --data ${DATASET_DIR} \
#  ${EXTRA_PARAMS[@]} ; ret_code=$?

set +x
sleep 3
#if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# end timing
end=$(date +%s)
end_fmt=$(date '+%Y-%m-%d %r')
echo "ENDING TIMING RUN AT $end_fmt"

# report result
result=$(( end - start ))
result_name="OBJECT_DETECTION"
echo "RESULT,$result_name,,$result,nvidia,$start_fmt"