run_multi.sh 4.59 KB
Newer Older
huchen's avatar
huchen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash

# SLURM batch script: multi-node object-detection training benchmark on
# DCU (ROCm) nodes.  stdout/stderr both go to ./test/output.<jobid>.
#SBATCH -p caspra
#SBATCH -N 30
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J 12_30_nv_lr
#SBATCH -o ./test/output.%j
#SBATCH -e ./test/output.%j

# Swap the default ROCm module for the ROCm 3.3 + PyTorch environment.
module rm compiler/rocm/2.9
source /public/home/aiss/Pytorch/rocm3.3_env_torch.sh

# Single-node DGX1 config — presumably defines DGXNGPU, DGXNSOCKET,
# DGXSOCKETCORES and MULTI_NODE used below; TODO confirm against the file.
source "$(pwd)/config_DGX1_singlenode.sh"

# For multi-node parameter sets, source this instead:
#source "$(pwd)/config_DGX1_multinode.sh"


# Fall back to sane defaults when not launched under SLURM.
SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE:-$DGXNGPU}
SLURM_JOB_ID=${SLURM_JOB_ID:-$RANDOM}
echo "Run vars: id $SLURM_JOB_ID gpus $SLURM_NTASKS_PER_NODE mparams $MULTI_NODE"

set -e

# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# run benchmark
set -x
NUMEPOCHS=${NUMEPOCHS:-100}
LR=${LR:-"2.5e-3"}

echo "running benchmark"

export DATASET_DIR="/public/software/apps/DeepLearning/Data/COCO2017"

# Resolve this job's node list into a one-host-per-line file.  The first
# host doubles as the rendezvous (master) address for bind_launch.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"
#rm "$(pwd)/hostfile-dl" -f
cat "${hostfile}" > "$(pwd)/tmp"
dist_url=$(sed -n '1p' ./tmp)
#echo $dist_url

rank=0
num_lines=$(wc -l < ./tmp)

# Fan out one launcher per node.  Ranks 0..num_lines-2 are started over ssh in
# the background; the final rank runs in the foreground so this script blocks
# while training runs.  The first host in ./tmp is the master address.
workdir=$(pwd)
for (( i = 0; i < num_lines - 1; i++ ))
do
    rank=$((i + 1))
    nodename=$(sed -n "${rank}p" ./tmp)
    ssh "${nodename}" "cd ${workdir} && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/rocm3.3_env_torch.sh && python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads --no_membind ${workdir}/train.py --epochs ${NUMEPOCHS} --warmup-factor 0 --threshold=0.23 --data ${DATASET_DIR} --batch-size 12 --warmup 10 --lr 4.1e-3 --wd 2e-4 --snapshot_path=${workdir}/${dist_url}" &
done

# Last rank (node_rank = num_lines-1) runs on the last host, in the foreground.
# Plain assignment instead of (( i = ... )): the arithmetic command returns
# exit status 1 when its value is 0, which would kill the script under set -e
# on a single-node allocation.
# NOTE(review): this rank uses --batch-size 10 --warmup 18 while the background
# ranks use --batch-size 12 --warmup 10 — looks deliberate, but confirm.
i=$((num_lines - 1))
nodename=$(sed -n "${num_lines}p" ./tmp)

ssh "${nodename}" "cd ${workdir} && module rm compiler/rocm/2.9 && source /public/home/aiss/Pytorch/rocm3.3_env_torch.sh && HIP_VISIBLE_DEVICES=0,1,2,3 python3 -m bind_launch --nnodes=$num_lines --node_rank=$i --master_addr=${dist_url} --master_port=4567 --nsockets_per_node ${DGXNSOCKET} --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node $SLURM_NTASKS_PER_NODE --no_hyperthreads --no_membind ${workdir}/train.py --epochs ${NUMEPOCHS} --warmup-factor 0 --threshold=0.23 --data ${DATASET_DIR} --batch-size 10 --warmup 18 --lr 4.1e-3 --wd 2e-4 --snapshot_path=${workdir}/${dist_url}"

# Reap the background ssh jobs so the timing report below covers every rank
# (previously only an unconditional 'sleep 3' stood between launch and report).
wait

#python3 -m bind_launch --nsockets_per_node ${DGXNSOCKET} \

#                      --ncores_per_socket ${DGXSOCKETCORES} \
#                      --nproc_per_node $SLURM_NTASKS_PER_NODE $MULTI_NODE \
#                      --no_hyperthreads \
#                      --no_membind \
# train.py \
#  --epochs "${NUMEPOCHS}" \
#  --warmup-factor 0 \
#  --lr "${LR}" \
#  --no-save \
#  --threshold=0.23 \
#  --data ${DATASET_DIR} \
#  ${EXTRA_PARAMS[@]} ; ret_code=$?

# Stop command tracing before producing the summary output.
set +x

sleep 3
#if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
printf '%s\n' "ENDING TIMING RUN AT $end_fmt"

# report result: one CSV line with elapsed wall-clock seconds
result=$((end - start))
result_name="OBJECT_DETECTION"

printf 'RESULT,%s,,%s,nvidia,%s\n' "$result_name" "$result" "$start_fmt"