"vscode:/vscode.git/clone" did not exist on "262d539a8a8f505dc72958f7ea50915a4b56dfac"
Commit b1232fb0 authored by hepj's avatar hepj
Browse files

增加多机多卡运行

parent 17bc28d5
#!/bin/bash
# Per-rank launcher for multi-node BERT phase-1 pre-training.
#
# Intended to be started by Open MPI's mpirun, one process per accelerator.
# It reads the rank variables mpirun exports, binds the process to the
# device / network adapter / NUMA node whose index equals the node-local
# rank, and then runs run_pretraining_v4.py.
#
# Usage:  mpirun ... <this script> <master-host>
# Args:   $1 - host name or address placed in --dist_url (port 34567)
# Env in: OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK,
#         OMPI_COMM_WORLD_SIZE, HOME
# Exit:   non-zero if an argument or rank variable is missing or the local
#         rank is outside 0-3; otherwise the status of the training command.

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
# NOTE(review): launch-blocking is normally a debugging aid - confirm it is
# wanted for real training runs.
export HIP_LAUNCH_BLOCKING=1

# Fail loudly instead of building "tcp://:34567" from an empty argument.
[[ -n "${1:-}" ]] || die "usage: ${0##*/} <master-host>"
master_host=$1

lrank=${OMPI_COMM_WORLD_LOCAL_RANK:-}
comm_rank=${OMPI_COMM_WORLD_RANK:-}
comm_size=${OMPI_COMM_WORLD_SIZE:-}
[[ -n "${comm_rank}" && -n "${comm_size}" ]] \
  || die "OMPI_COMM_WORLD_RANK/SIZE are not set; start this script with mpirun"

# Only local ranks 0-3 have a device mapping. Previously any other value fell
# through the case statement and the rank exited without starting anything.
case "${lrank}" in
  [0-3]) ;;
  *) die "unsupported local rank '${lrank}' (expected 0-3)" ;;
esac

export PATH_PHRASE1=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

out_dir=${HOME}/outdir/torch/pre_wiki/phrase1

# The command is kept as an array so paths containing whitespace survive.
# NOTE(review): the global MPI rank is what gets passed as --local_rank, and
# --gpus_per_node is 2 although four local ranks are supported; both values
# are unchanged from the original - confirm against run_pretraining_v4.py.
APP=(
  python3 "${HOME}/torch/bert-pretrain/run_pretraining_v4.py"
  "--input_dir=${PATH_PHRASE1}"
  "--output_dir=${out_dir}"
  "--config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json"
  --bert_model=bert-large-uncased
  --train_batch_size=16
  --max_seq_length=128
  --max_predictions_per_seq=20
  --max_steps=100000
  --warmup_proportion=0.0
  --num_steps_per_checkpoint=20000
  --learning_rate=4.0e-4
  --seed=12439
  --gradient_accumulation_steps=1
  --allreduce_post_accumulation
  --gpus_per_node 2
  --do_train
  --local_rank "${comm_rank}"
  --world_size "${comm_size}"
  --dist_url "tcp://${master_host}:34567"
  --json-summary "${out_dir}/dllogger.json"
)

# Device, adapter and NUMA node all share the local rank index.
export HIP_VISIBLE_DEVICES=${lrank}
export UCX_NET_DEVICES=mlx5_${lrank}:1
export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs

# Log the exact command line before running it.
echo NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
#!/bin/bash
# Per-rank launcher for multi-node BERT phase-2 pre-training.
#
# Intended to be started by Open MPI's mpirun, one process per accelerator.
# It reads the rank variables mpirun exports, binds the process to the
# device / network adapter / NUMA node whose index equals the node-local
# rank, and then runs run_pretraining_v4.py with the phase-2 options.
#
# Usage:  mpirun ... <this script> <master-host>
# Args:   $1 - host name or address placed in --dist_url (port 34567)
# Env in: OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK,
#         OMPI_COMM_WORLD_SIZE, HOME
# Exit:   non-zero if an argument or rank variable is missing or the local
#         rank is outside 0-3; otherwise the status of the training command.

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
# NOTE(review): launch-blocking is normally a debugging aid - confirm it is
# wanted for real training runs.
export HIP_LAUNCH_BLOCKING=1

# Fail loudly instead of building "tcp://:34567" from an empty argument.
[[ -n "${1:-}" ]] || die "usage: ${0##*/} <master-host>"
master_host=$1

lrank=${OMPI_COMM_WORLD_LOCAL_RANK:-}
comm_rank=${OMPI_COMM_WORLD_RANK:-}
comm_size=${OMPI_COMM_WORLD_SIZE:-}
[[ -n "${comm_rank}" && -n "${comm_size}" ]] \
  || die "OMPI_COMM_WORLD_RANK/SIZE are not set; start this script with mpirun"

# Only local ranks 0-3 have a device mapping. Previously any other value fell
# through the case statement and the rank exited without starting anything.
case "${lrank}" in
  [0-3]) ;;
  *) die "unsupported local rank '${lrank}' (expected 0-3)" ;;
esac

export PATH_PHRASE2=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

# Single source of truth for the output location. The JSON summary used to be
# written under ".../pre_wiki4/phrase2" while --output_dir pointed at
# ".../pre_wiki/phrase2"; both now use the same directory.
out_dir=${HOME}/outdir/torch/pre_wiki/phrase2

# The command is kept as an array so paths containing whitespace survive.
# NOTE(review): the global MPI rank is what gets passed as --local_rank, and
# --gpus_per_node is 2 although four local ranks are supported; both values
# are unchanged from the original - confirm against run_pretraining_v4.py.
APP=(
  python3 "${HOME}/torch/bert-pretrain/run_pretraining_v4.py"
  "--input_dir=${PATH_PHRASE2}"
  "--output_dir=${out_dir}"
  "--config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json"
  --bert_model=bert-large-uncased
  --train_batch_size=2
  --max_seq_length=512
  --max_predictions_per_seq=80
  --max_steps=400000
  --warmup_proportion=0.128
  --num_steps_per_checkpoint=20000
  --learning_rate=4.0e-3
  --seed=12439
  --gradient_accumulation_steps=1
  --allreduce_post_accumulation
  --gpus_per_node 2
  --do_train
  --phase2
  --phase1_end_step=0
  --local_rank "${comm_rank}"
  --world_size "${comm_size}"
  --dist_url "tcp://${master_host}:34567"
  --json-summary "${out_dir}/dllogger.json"
)

# Device, adapter and NUMA node all share the local rank index.
export HIP_VISIBLE_DEVICES=${lrank}
export UCX_NET_DEVICES=mlx5_${lrank}:1
export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs

# Log the exact command line before running it.
echo NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#
# Slurm batch job: BERT phase-1 pre-training across the allocated nodes.
#
# Writes the allocated host names to ./<job id>, builds an Open MPI hostfile
# (hostfile-<job id>, DCUS_PER_NODE slots per host) in the submission
# directory, and starts one 2nodes_single_process_pre1.sh per slot with the
# first host as the rendezvous address.
#
# A former "SBARCH -n 32" line was removed: it was misspelled, so Slurm never
# honoured it, and 32 tasks would contradict -N 2 with --ntasks-per-node=4.
#
# Adjust -p / -J above and HOME_PATH / WORK_PATH below for your site.
set -x

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Must match --ntasks-per-node / --gres above and the local ranks the
# per-rank launcher supports.
readonly DCUS_PER_NODE=4

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run

[[ -n "${SLURM_JOB_ID:-}" && -n "${SLURM_JOB_NODELIST:-}" ]] \
  || die "SLURM_JOB_ID/SLURM_JOB_NODELIST are not set; submit this script with sbatch"

[[ -r ~/env22.10.sh ]] || die "environment file ~/env22.10.sh is missing or unreadable"
# shellcheck source=/dev/null
source ~/env22.10.sh
# Record which interpreter the job will use.
command -v python3

hostfile=./${SLURM_JOB_ID}
mpi_hostfile=$(pwd)/hostfile-${SLURM_JOB_ID}
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" \
  || die "scontrol could not expand the node list '${SLURM_JOB_NODELIST}'"

# Start from an empty MPI hostfile so a requeued job (same job id) does not
# accumulate duplicate entries.
: > "${mpi_hostfile}"
num_node=0
nodename=
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  # The first host listed becomes the rendezvous address.
  [[ -n "${nodename}" ]] || nodename=${node}
  printf '%s slots=%d\n' "${node}" "${DCUS_PER_NODE}" >> "${mpi_hostfile}"
  num_node=$((num_node + 1))
done < "${hostfile}"
((num_node > 0)) || die "no hosts found in ${hostfile}"

num_dcu=$((num_node * DCUS_PER_NODE))
echo "${num_dcu}"
echo "${nodename}"
dist_url=${nodename}

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
  "${WORK_PATH}/2nodes_single_process_pre1.sh" "${dist_url}"
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#
# Slurm batch job: BERT phase-2 pre-training across the allocated nodes.
#
# Writes the allocated host names to ./<job id>, builds an Open MPI hostfile
# (hostfile-<job id>, DCUS_PER_NODE slots per host) in the submission
# directory, and starts one 2nodes_single_process_pre2.sh per slot with the
# first host as the rendezvous address.
#
# A former "SBARCH -n 32" line was removed: it was misspelled, so Slurm never
# honoured it, and 32 tasks would contradict -N 2 with --ntasks-per-node=4.
#
# Adjust -p / -J above and HOME_PATH / WORK_PATH below for your site.
set -x

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Must match --ntasks-per-node / --gres above and the local ranks the
# per-rank launcher supports.
readonly DCUS_PER_NODE=4

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run

[[ -n "${SLURM_JOB_ID:-}" && -n "${SLURM_JOB_NODELIST:-}" ]] \
  || die "SLURM_JOB_ID/SLURM_JOB_NODELIST are not set; submit this script with sbatch"

[[ -r ~/env22.10.sh ]] || die "environment file ~/env22.10.sh is missing or unreadable"
# shellcheck source=/dev/null
source ~/env22.10.sh
# Record which interpreter the job will use.
command -v python3

hostfile=./${SLURM_JOB_ID}
mpi_hostfile=$(pwd)/hostfile-${SLURM_JOB_ID}
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" \
  || die "scontrol could not expand the node list '${SLURM_JOB_NODELIST}'"

# Start from an empty MPI hostfile so a requeued job (same job id) does not
# accumulate duplicate entries.
: > "${mpi_hostfile}"
num_node=0
nodename=
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  # The first host listed becomes the rendezvous address.
  [[ -n "${nodename}" ]] || nodename=${node}
  printf '%s slots=%d\n' "${node}" "${DCUS_PER_NODE}" >> "${mpi_hostfile}"
  num_node=$((num_node + 1))
done < "${hostfile}"
((num_node > 0)) || die "no hosts found in ${hostfile}"

num_dcu=$((num_node * DCUS_PER_NODE))
echo "${num_dcu}"
echo "${nodename}"
dist_url=${nodename}

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
  "${WORK_PATH}/2nodes_single_process_pre2.sh" "${dist_url}"
#!/bin/bash
# Per-rank launcher for multi-node BERT SQuAD v1.1 fine-tuning + prediction.
#
# Intended to be started by Open MPI's mpirun, one process per accelerator.
# It reads the rank variables mpirun exports, binds the process to the
# device / network adapter / NUMA node whose index equals the node-local
# rank, and then runs run_squad_v4.py.
#
# Usage:  mpirun ... <this script> <master-host>
# Args:   $1 - host name or address placed in --dist_url (port 34567)
# Env in: OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK,
#         OMPI_COMM_WORLD_SIZE, HOME
# Exit:   non-zero if an argument or rank variable is missing or the local
#         rank is outside 0-3; otherwise the status of the training command.

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
# NOTE(review): launch-blocking is normally a debugging aid - confirm it is
# wanted for real training runs.
export HIP_LAUNCH_BLOCKING=1

# Fail loudly instead of building "tcp://:34567" from an empty argument.
[[ -n "${1:-}" ]] || die "usage: ${0##*/} <master-host>"
master_host=$1

lrank=${OMPI_COMM_WORLD_LOCAL_RANK:-}
comm_rank=${OMPI_COMM_WORLD_RANK:-}
comm_size=${OMPI_COMM_WORLD_SIZE:-}
[[ -n "${comm_rank}" && -n "${comm_size}" ]] \
  || die "OMPI_COMM_WORLD_RANK/SIZE are not set; start this script with mpirun"

# Only local ranks 0-3 have a device mapping. Previously any other value fell
# through the case statement and the rank exited without starting anything.
case "${lrank}" in
  [0-3]) ;;
  *) die "unsupported local rank '${lrank}' (expected 0-3)" ;;
esac

out_dir=${HOME}/outdir/torch/SQUAD4

# The command is kept as an array so paths containing whitespace survive.
# NOTE(review): the training script path is hard-coded to one user's home
# while every other path uses ${HOME} - adjust for your installation.
# NOTE(review): the global MPI rank is what gets passed as --local_rank, and
# --gpus_per_node is 2 although four local ranks are supported; both values
# are unchanged from the original - confirm against run_squad_v4.py.
APP=(
  python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py
  --train_file "${HOME}/data/sq1.1/train-v1.1.json"
  --predict_file "${HOME}/data/sq1.1/dev-v1.1.json"
  --init_checkpoint "${HOME}/model/pytorch_bert/model.ckpt-28252.pt"
  --vocab_file "${HOME}/model/pytorch_bert/vocab.txt"
  --output_dir "${out_dir}"
  --config_file "${HOME}/model/pytorch_bert/bert_config.json"
  --json-summary "${out_dir}/results.json"
  --bert_model bert-large-uncased
  --do_train
  --do_predict
  --train_batch_size 4
  --predict_batch_size 4
  --gpus_per_node 2
  --local_rank "${comm_rank}"
  --world_size "${comm_size}"
  --use_env
  --dist_url "tcp://${master_host}:34567"
)

# Device, adapter and NUMA node all share the local rank index.
export HIP_VISIBLE_DEVICES=${lrank}
export UCX_NET_DEVICES=mlx5_${lrank}:1
export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs

# Log the exact command line before running it.
echo NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
NCCL_SOCKET_IFNAME=eno1 numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#
# Slurm batch job: BERT SQuAD fine-tuning across the allocated nodes.
#
# Writes the allocated host names to ./<job id>, builds an Open MPI hostfile
# (hostfile-<job id>, DCUS_PER_NODE slots per host) in the submission
# directory, and starts one 2nodes_single_process.sh per slot with the first
# host as the rendezvous address.
#
# A former "SBARCH -n 32" line was removed: it was misspelled, so Slurm never
# honoured it, and 32 tasks would contradict -N 2 with --ntasks-per-node=4.
#
# Adjust -p / -J above and HOME_PATH / WORK_PATH below for your site.
set -x

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Must match --ntasks-per-node / --gres above and the local ranks the
# per-rank launcher supports.
readonly DCUS_PER_NODE=4

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-squad/2node-run

[[ -n "${SLURM_JOB_ID:-}" && -n "${SLURM_JOB_NODELIST:-}" ]] \
  || die "SLURM_JOB_ID/SLURM_JOB_NODELIST are not set; submit this script with sbatch"

[[ -r ~/env22.10.sh ]] || die "environment file ~/env22.10.sh is missing or unreadable"
# shellcheck source=/dev/null
source ~/env22.10.sh
# Record which interpreter the job will use.
command -v python3

# Optional NCCL tuning / diagnostics, disabled by default:
#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5

hostfile=./${SLURM_JOB_ID}
mpi_hostfile=$(pwd)/hostfile-${SLURM_JOB_ID}
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" \
  || die "scontrol could not expand the node list '${SLURM_JOB_NODELIST}'"

# Start from an empty MPI hostfile so a requeued job (same job id) does not
# accumulate duplicate entries.
: > "${mpi_hostfile}"
num_node=0
nodename=
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  # The first host listed becomes the rendezvous address.
  [[ -n "${nodename}" ]] || nodename=${node}
  printf '%s slots=%d\n' "${node}" "${DCUS_PER_NODE}" >> "${mpi_hostfile}"
  num_node=$((num_node + 1))
done < "${hostfile}"
((num_node > 0)) || die "no hosts found in ${hostfile}"

num_dcu=$((num_node * DCUS_PER_NODE))
echo "${num_dcu}"
echo "${nodename}"
dist_url=${nodename}

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

# Alternative launch lines kept for reference (other launchers / profiling):
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process_ddp.sh $dist_url
#hipprof mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/hipprof_single.sh $dist_url
#hipprof mpirun -np 4 --hostfile hostfile-18261131 hipprof_single.sh j17r3n01
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID hipprof ${WORK_PATH}/2nodes_single_process.sh $dist_url
mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
  "${WORK_PATH}/2nodes_single_process.sh" "${dist_url}"
......@@ -108,6 +108,14 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
./bert_squad4_fp16.sh #半精度 (按自己路径对single_squad4_fp16.sh里APP设置进行修改)
```
```
#多机多卡
cd 2node-run-squad
sbatch run_bert_squad_4dcu.sh #(按照自己情况对#SBATCH -p、#SBATCH -J以及脚本中的HOME_PATH、WORK_PATH进行修改;需要fp16可以在相应single文件的APP中增加 --fp16 与 --amp 参数;运行结果保存在相应的slurm文件中)
```
## 4.**PHRASE测试**
### 1.参数说明
......@@ -142,6 +150,9 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
./bert_pre1_4.sh #单精度 (按自己路径对single_pre1_4.sh里APP设置进行修改)
./bert_pre1_4_fp16.sh #半精度 (按自己路径对single_pre1_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre1_4dcu.sh #(按照自己情况对#SBATCH -p、#SBATCH -J以及脚本中的HOME_PATH、WORK_PATH进行修改;需要fp16可以在相应single文件的APP中增加 --fp16 与 --amp 参数;运行结果保存在相应的slurm文件中)
```
### 3.PHRASE2
......@@ -153,6 +164,8 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
./bert_pre2_4.sh #单精度 (按自己路径对single_pre2_4.sh里APP设置进行修改)
./bert_pre2_4_fp16.sh #半精度 (按自己路径对single_pre2_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre2_4dcu.sh #(按照自己情况对#SBATCH -p、#SBATCH -J以及脚本中的HOME_PATH、WORK_PATH进行修改;需要fp16可以在相应single文件的APP中增加 --fp16 与 --amp 参数;运行结果保存在相应的slurm文件中)
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment