#!/usr/bin/env bash
# Launch crf_ddp.py with torch.distributed.run on a contiguous range of DCUs.
#
# Usage: <script> [NUM] [START]
#   NUM    Number of DCUs to use (one process is started per DCU). When
#          omitted, it is derived from `rocm-smi` output: the number of lines
#          in the block that starts at a line containing "DCU" and ends at the
#          next line containing "===", minus those two delimiting lines
#          (presumably one line per device -- verify against your rocm-smi).
#   START  ID of the first DCU to use (default: 0).
#
# Exports HIP_VISIBLE_DEVICES (comma-separated IDs START..START+NUM-1) and
# HSA_FORCE_FINE_GRAIN_PCIE=1, then runs the training and mirrors its combined
# stdout/stderr into bert_base_<NUM>dcu_<timestamp>.log in the current directory.

# Make the pipeline into `tee` report the trainer's exit status instead of
# tee's. Probed in a subshell first so shells without `pipefail` keep working.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

START=0

if [ "$#" -gt 0 ]; then
  ## DCU number
  NUM=$1
else
  # Only query rocm-smi when the count was not given explicitly.
  # The "- 2" drops the header line and the closing "===" line of the block.
  NUM=$(( $(rocm-smi | sed -n '/DCU/,/===/ p' | wc -l) - 2 ))
fi

if [ "$#" -gt 1 ]; then
  ## The first DCU ID
  START=$2
fi

# Reject empty, negative or non-numeric values early; a failed rocm-smi
# detection yields -2, which would otherwise reach seq and --nproc_per_node.
case "${NUM}" in
  '' | *[!0-9]*)
    printf 'error: invalid DCU number "%s" (usage: %s [NUM] [START])\n' \
      "${NUM}" "$0" >&2
    exit 2
    ;;
esac
if [ "${NUM}" -lt 1 ]; then
  printf 'error: DCU number must be at least 1, got "%s"\n' "${NUM}" >&2
  exit 2
fi
case "${START}" in
  '' | *[!0-9]*)
    printf 'error: invalid first DCU ID "%s" (usage: %s [NUM] [START])\n' \
      "${START}" "$0" >&2
    exit 2
    ;;
esac

LAST=$(( START + NUM - 1 ))

# Assign and export separately so a failing `seq` is not masked by `export`.
HIP_VISIBLE_DEVICES=$(seq -s, "${START}" "${LAST}") || exit 1
export HIP_VISIBLE_DEVICES
export HSA_FORCE_FINE_GRAIN_PCIE=1

logfile="bert_base_${NUM}dcu_$(date +%Y%m%d%H%M%S).log"

# fp32
python3 -m torch.distributed.run --nproc_per_node="${NUM}" crf_ddp.py \
  --batch-size=64 --root-path=/bert4torch/datasets --epochs=20 2>&1 \
  | tee "${logfile}"

# fp16
#python3 -m torch.distributed.run --nproc_per_node="${NUM}" crf_ddp.py \
#  --use-amp --batch-size=64 --root-path=/bert4torch/datasets --epochs=20 2>&1 \
#  | tee "${logfile}"