multi_train.sh 563 Bytes
Newer Older
yangzhong's avatar
yangzhong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

# Select the DCUs for this run and name the log file.
#
# Usage: multi_train.sh [NUM [START]]
#   NUM    Number of DCUs to use. When omitted, it is derived from the
#          table printed by `rocm-smi` (see below).
#   START  ID of the first DCU to use (default: 0).
#
# Sets NUM, START, LAST and logfile, exports HIP_VISIBLE_DEVICES as the
# comma-separated ID list START..LAST, and exports
# HSA_FORCE_FINE_GRAIN_PCIE=1.
# Exits with status 1 and a message on stderr when the DCU count or the
# start ID is not a usable integer.

# Print an error message to stderr and abort the script.
die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
}

START=0
if [ $# -gt 0 ]; then      ## DCU number given explicitly
    NUM=$1
else
    # Only probe the hardware when no count was supplied.
    command -v rocm-smi >/dev/null 2>&1 \
        || die "rocm-smi not found; pass the DCU count as the first argument"
    # Count the lines from the first one matching "DCU" through the next one
    # matching "===", then subtract 2.
    # NOTE(review): the 2 presumably accounts for the table header and the
    # closing rule -- confirm against the rocm-smi output format in use.
    NUM=$(( $(rocm-smi | sed -n '/DCU/,/===/ p' | wc -l) - 2 ))
fi
if [ $# -gt 1 ]; then      ## the first DCU ID
    START=$2
fi

# Reject anything that is not a plain decimal number. A failed auto-detection
# (negative count) would otherwise produce an empty HIP_VISIBLE_DEVICES, and a
# non-numeric argument would silently evaluate to 0 in the arithmetic below.
case "${NUM}" in
    '' | *[!0-9]*)
        die "DCU count must be a positive integer, got '${NUM}' (pass it as the first argument if auto-detection failed)"
        ;;
esac
[ "${NUM}" -ge 1 ] || die "DCU count must be at least 1, got '${NUM}'"
case "${START}" in
    '' | *[!0-9]*)
        die "first DCU ID must be a non-negative integer, got '${START}'"
        ;;
esac

LAST=$((START + NUM - 1))
# Assign first and export afterwards so a failing `seq` is not hidden behind
# the always-successful `export`.
HIP_VISIBLE_DEVICES=$(seq -s, "${START}" "${LAST}") \
    || die "could not build the device list ${START}..${LAST}"
export HIP_VISIBLE_DEVICES

# Runtime setting carried over unchanged from the original script.
export HSA_FORCE_FINE_GRAIN_PCIE=1
# Log name carries the DCU count and a YYYYmmddHHMMSS timestamp.
logfile="bert_base_${NUM}dcu_$(date +%Y%m%d%H%M%S).log"

yangzhong's avatar
yangzhong committed
16
17
# Launch crf_ddp.py through torch.distributed.run with NUM processes on this
# node, mirroring stdout and stderr to both the terminal and ${logfile}.
#
# Without pipefail the pipeline's exit status is tee's, so a crashed training
# run would still report success. pipefail is enabled only under bash so the
# script keeps working with sh implementations that lack the option.
if [ -n "${BASH_VERSION:-}" ]; then
    set -o pipefail
fi
python3 -m torch.distributed.run --nproc_per_node="${NUM}" crf_ddp.py 2>&1 | tee "${logfile}"                # fp32
#python3 -m torch.distributed.run --nproc_per_node="${NUM}" crf_ddp.py --use-amp 2>&1 | tee "${logfile}"     # fp16