multi_train.sh 565 Bytes
Newer Older
wangsen's avatar
wangsen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

# multi_train.sh - launch crf_ddp.py through torch.distributed.run on a
# contiguous range of DCU devices.
#
# Usage: multi_train.sh [NUM [START]]
#   NUM    number of DCUs / worker processes to use (positive integer).
#          When omitted, it is derived from the `rocm-smi` table.
#   START  ID of the first DCU to use (non-negative integer, default 0).
#
# Devices START .. START+NUM-1 are exposed through HIP_VISIBLE_DEVICES and
# the combined stdout/stderr of the run is copied to a timestamped log file
# in the current directory.
#
# Exit status: non-zero on invalid arguments or failed device detection.
# When the shell supports `pipefail`, a failing training run is propagated
# as well; otherwise the status is that of `tee`.

set -eu
# Without pipefail the exit status of "python3 ... | tee" is always tee's.
# Probe in a subshell first so shells lacking the option keep working.
if (set -o pipefail) 2>/dev/null; then
    set -o pipefail
fi

die() {
    printf 'multi_train.sh: %s\n' "$*" >&2
    exit 1
}

NUM=${1:-}        ##DCU Number
START=${2:-0}     ##The First DCU ID

# Only query rocm-smi when the caller did not supply the DCU count.
if [ -z "${NUM}" ]; then
    command -v rocm-smi >/dev/null 2>&1 ||
        die "rocm-smi not found; pass the DCU count as the first argument"
    table_lines=$(rocm-smi | sed -n '/DCU/,/===/ p' | wc -l) ||
        die "rocm-smi failed; pass the DCU count as the first argument"
    # The range printed by sed runs from the first line mentioning DCU to the
    # next === line; the 2 extra lines are presumably the column header and
    # the closing rule -- confirm against your rocm-smi output format.
    NUM=$((table_lines - 2))
fi

# Reject empty, non-numeric, zero and leading-zero values (a leading zero
# would be read as octal by shell arithmetic).
case "${NUM}" in
    '' | *[!0-9]* | 0*)
        die "DCU count must be a positive integer, got '${NUM}'"
        ;;
esac
case "${START}" in
    0) ;;
    '' | *[!0-9]* | 0*)
        die "first DCU ID must be a non-negative integer, got '${START}'"
        ;;
esac

LAST=$((START + NUM - 1))

# Build the comma-separated ID list START,START+1,...,LAST.
devices=${START}
id=$((START + 1))
while [ "${id}" -le "${LAST}" ]; do
    devices="${devices},${id}"
    id=$((id + 1))
done
export HIP_VISIBLE_DEVICES="${devices}"

# Optional library logging switches, left disabled as in the original.
#export ROCBLAS_LAYER=3
#export MIOPEN_ENABLE_LOGGING=1
#export MIOPEN_ENABLE_LOGGING_CMD=1
#export MIOPEN_LOG_LEVEL=6

# Runtime setting carried over unchanged from the original script.
export HSA_FORCE_FINE_GRAIN_PCIE=1

logfile="bert_base_${NUM}dcu_bs64_epoch30_$(date +%Y%m%d%H%M%S).log"

# One worker process per selected DCU; stderr is merged into stdout so the
# log file captures both streams.
python3 -m torch.distributed.run --nproc_per_node="${NUM}" crf_ddp.py 2>&1 |
    tee "${logfile}"