Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
b1232fb0
Commit
b1232fb0
authored
Jan 11, 2023
by
hepj
Browse files
增加多机多卡运行
parent
17bc28d5
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
301 additions
and
1 deletion
+301
-1
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
+61
-0
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
+64
-0
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
+32
-0
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
+32
-0
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
+57
-0
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
+41
-0
PyTorch/NLP/BERT/README.md
PyTorch/NLP/BERT/README.md
+14
-1
No files found.
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT phase-1 pretraining (seq_len=128) on a 2-node
# DCU cluster. mpirun starts one instance per rank (see
# run_bert_pre1_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process_pre1.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

export PATH_PHRASE1=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
    --input_dir=${PATH_PHRASE1} \
    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase1 \
    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
    --bert_model=bert-large-uncased \
    --train_batch_size=16 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --max_steps=100000 \
    --warmup_proportion=0.0 \
    --num_steps_per_checkpoint=20000 \
    --learning_rate=4.0e-4 \
    --seed=12439 \
    --gradient_accumulation_steps=1 \
    --allreduce_post_accumulation \
    --gpus_per_node 2 \
    --do_train \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --dist_url tcp://${master_addr}:34567 \
    --json-summary ${HOME}/outdir/torch/pre_wiki/phrase1/dllogger.json"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT phase-2 pretraining (seq_len=512) on a 2-node
# DCU cluster. mpirun starts one instance per rank (see
# run_bert_pre2_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process_pre2.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

export PATH_PHRASE2=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

# NOTE(review): --json-summary writes under pre_wiki4/phrase2 while
# --output_dir uses pre_wiki/phrase2 — looks like a typo, but left
# unchanged; confirm which directory is intended.
APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
    --input_dir=${PATH_PHRASE2} \
    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase2 \
    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
    --bert_model=bert-large-uncased \
    --train_batch_size=2 \
    --max_seq_length=512 \
    --max_predictions_per_seq=80 \
    --max_steps=400000 \
    --warmup_proportion=0.128 \
    --num_steps_per_checkpoint=20000 \
    --learning_rate=4.0e-3 \
    --seed=12439 \
    --gradient_accumulation_steps=1 \
    --allreduce_post_accumulation \
    --gpus_per_node 2 \
    --do_train \
    --phase2 \
    --phase1_end_step=0 \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --dist_url tcp://${master_addr}:34567 \
    --json-summary ${HOME}/outdir/torch/pre_wiki4/phrase2/dllogger.json"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT phase-1 pretraining across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process_pre1.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
which python3

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process_pre1.sh" "$dist_url"
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT phase-2 pretraining across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process_pre2.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
which python3

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process_pre2.sh" "$dist_url"
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT SQuAD v1.1 fine-tuning on a 2-node DCU
# cluster. mpirun starts one instance per rank (see
# run_bert_squad_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
    --train_file ${HOME}/data/sq1.1/train-v1.1.json \
    --predict_file ${HOME}/data/sq1.1/dev-v1.1.json \
    --init_checkpoint ${HOME}/model/pytorch_bert/model.ckpt-28252.pt \
    --vocab_file ${HOME}/model/pytorch_bert/vocab.txt \
    --output_dir ${HOME}/outdir/torch/SQUAD4 \
    --config_file ${HOME}/model/pytorch_bert/bert_config.json \
    --json-summary ${HOME}/outdir/torch/SQUAD4/results.json \
    --bert_model bert-large-uncased \
    --do_train \
    --do_predict \
    --train_batch_size 4 \
    --predict_batch_size 4 \
    --gpus_per_node 2 \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --use_env \
    --dist_url tcp://${master_addr}:34567"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT SQuAD fine-tuning across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-squad/2node-run
source ~/env22.10.sh
which python3

#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process_ddp.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
#hipprof mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/hipprof_single.sh $dist_url
#hipprof mpirun -np 4 --hostfile hostfile-18261131 hipprof_single.sh j17r3n01
mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process.sh" "$dist_url"
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID hipprof ${WORK_PATH}/2nodes_single_process.sh $dist_url
PyTorch/NLP/BERT/README.md
View file @
b1232fb0
...
@@ -108,6 +108,14 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -108,6 +108,14 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
./bert_squad4_fp16.sh #半精度 (按自己路径对single_squad4_fp16.sh里APP设置进行修改)
./bert_squad4_fp16.sh #半精度 (按自己路径对single_squad4_fp16.sh里APP设置进行修改)
```
```
```
#多机多卡
cd 2node-run-squad
sbatch run_bert_squad_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
## 4.**PHRASE测试**
## 4.**PHRASE测试**
### 1.参数说明
### 1.参数说明
...
@@ -142,6 +150,9 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -142,6 +150,9 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
#多卡
./bert_pre1_4.sh #单精度 (按自己路径对single_pre1_4.sh里APP设置进行修改)
./bert_pre1_4.sh #单精度 (按自己路径对single_pre1_4.sh里APP设置进行修改)
./bert_pre1_4_fp16.sh #半精度 (按自己路径对single_pre1_4_fp16.sh里APP设置进行修改)
./bert_pre1_4_fp16.sh #半精度 (按自己路径对single_pre1_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre1_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
```
### 3.PHRASE2
### 3.PHRASE2
...
@@ -153,6 +164,8 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -153,6 +164,8 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
#多卡
./bert_pre2_4.sh #单精度 (按自己路径对single_pre2_4.sh里APP设置进行修改)
./bert_pre2_4.sh #单精度 (按自己路径对single_pre2_4.sh里APP设置进行修改)
./bert_pre2_4_fp16.sh #半精度 (按自己路径对single_pre2_4_fp16.sh里APP设置进行修改)
./bert_pre2_4_fp16.sh #半精度 (按自己路径对single_pre2_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre2_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
```
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment