Commit 7d366e11 authored by hepj

Add multi-node, multi-GPU torch training

parent b1232fb0
#!/bin/bash
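# Per-rank launch wrapper for Conformer training: mpirun starts one copy of
# this script per process; it maps the OpenMPI rank variables onto the
# environment torch.distributed expects, then binds each local rank to its
# own DCU, InfiniBand HCA, and NUMA node.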
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export RANK=$OMPI_COMM_WORLD_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export HIP_VISIBLE_DEVICES=0,1,2,3
#pyenv activate torch-dtk22.04.2
source ~/env22.04.2.sh
APP="python /work/home/hepj/torch/Conformer-main/main.py \
--model Conformer_small_patch16 \
--data-set IMNET \
--batch-size 64 \
--world_size 4 \
--lr 0.001 \
--local_rank ${comm_rank} \
--dist_url tcp://${1}:9999 \
--data-path /public/DL_DATA/ImageNet-pytorch \
--output_dir /work/home/hepj/torch/Conformer-main/out_dir \
--epochs 1"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/Conformer-main/2node-run
source ~/env22.10.sh
which python3
#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5
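# Build an OpenMPI hostfile from the Slurm allocation, one "<node> slots=4"
# line per allocated node, and count the nodes as we go.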
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
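# Use the first allocated node as the torch.distributed rendezvous host.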
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
#export NCCL_DEBUG=INFO
#export HSA_USERPTR_FOR_PAGED_MEM=0
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process_ddp.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
#hipprof mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/hipprof_single.sh $dist_url
#hipprof mpirun -np 4 --hostfile hostfile-18261131 hipprof_single.sh j17r3n01
#mpirun -np 1 --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID hipprof ${WORK_PATH}/2nodes_single_process.sh $dist_url
@@ -39,7 +39,7 @@ import collections.abc as container_abcs
/public/software/apps/DeepLearning/Data/ImageNet-pytorch
## Single DCU
```
# Launch
@@ -59,3 +59,10 @@ In the sh script, --nnodes is the number of nodes and --nproc_per_node the number of GPUs per node,
./run4.sh
```
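As a rough sketch of what such a launch script runs (the master address, port, and argument values below are placeholders, not this repo's exact settings), a two-node `torch.distributed.launch` invocation looks like:
```
# Run on node 0; on the second node set --node_rank=1.
# master_addr and master_port are placeholders.
python3 -m torch.distributed.launch --nnodes=2 --nproc_per_node=4 \
    --node_rank=0 --master_addr=node0 --master_port=29500 \
    main.py --batch-size 64
```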
## Multi-node, multi-DCU
```
cd 2node-run-comformer
sbatch run_conformer_4dcus.sh (adjust #SBATCH -p and #SBATCH -J for your setup; the output is saved to the corresponding slurm file)
```
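For example, to submit under your own job name and partition (both values below are placeholders), edit the script header:
```
#SBATCH -J conformer-2node   # job name (placeholder)
#SBATCH -p your_partition    # partition/queue (placeholder)
```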
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
WORK_PATH=/work/home/hepj/torch/Vision_Transformer/2node-run
source ~/env22.10.sh
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
# export NCCL_DEBUG=INFO
# export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_finetune-4.sh $dist_url
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
WORK_PATH=/work/home/hepj/torch/mae-main/2node-run
source ~/env22.04.2.sh
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_pre-4.sh $dist_url
#!/bin/bash
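# Per-rank launch wrapper for MAE fine-tuning; uses the same OpenMPI-rank to
# DCU/HCA/NUMA binding scheme as the Conformer script above.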
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
source ~/env22.10.sh
export PRETRAIN_CHKPT=/work/home/hepj/model/VIT/mae_pretrain_vit_base.pth  # alternatively: mae_finetuned_vit_base.pth
export IMAGENET_DIR=/public/DL_DATA/ImageNet-pytorch
export HIP_VISIBLE_DEVICES=0,1,2,3
APP="python /work/home/hepj/torch/mae-main/main_finetune.py \
--batch_size 32 \
--dist_on_itp \
--dist_url tcp://${1}:34567 \
--local_rank ${comm_rank} \
--model vit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 1 \
--blr 5e-4 --layer_decay 0.65 --weight_decay 0.05 \
--drop_path 0.1 --mixup 0.8 --cutmix 1.0 --reprob 0.25 --dist_eval \
--data_path ${IMAGENET_DIR} \
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
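# Per-rank launch wrapper for MAE pre-training.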
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
pyenv activate torch-dtk22.04.2
source ~/env22.04.2.sh
export PRETRAIN_CHKPT=/work/home/hepj/model/VIT/mae_pretrain_vit_base.pth  # alternatively: mae_finetuned_vit_base.pth
export IMAGENET_DIR=/public/DL_DATA/ImageNet-pytorch
export HIP_VISIBLE_DEVICES=0,1,2,3
APP="python /work/home/hepj/torch/mae-main/main_pretrain.py \
--epochs 1 \
--dist_on_itp \
--dist_url tcp://${1}:34567 \
--local_rank ${comm_rank} \
--model mae_vit_base_patch16 \
--batch_size 64 \
--data_path ${IMAGENET_DIR}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=1
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=2
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
@@ -88,6 +88,13 @@ OMP_NUM_THREADS=1 python3 -m torch.distributed.launch --nproc_per_node=4 main_
```
## Multi-node, multi-DCU
```
cd 2node-run-vit
sbatch run-vit-pre.sh (adjust #SBATCH -p and #SBATCH -J for your setup; the output is saved to the corresponding slurm file)
```
# Fine-tuning
@@ -135,6 +142,13 @@ OMP_NUM_THREADS=1 python3 -m torch.distributed.launch --nproc_per_node=4 main_fi
--dist_eval --data_path ${IMAGENET_DIR}
```
## Multi-node, multi-DCU
```
cd 2node-run-vit
sbatch run-vit-finetune.sh (adjust #SBATCH -p and #SBATCH -J for your setup; the output is saved to the corresponding slurm file)
```
# Result validation
The model used for validation is mae_finetuned_vit_xxx.pth; download address:
......
#!/bin/bash
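# Per-rank launch wrapper for fairseq Transformer training. Unlike the
# scripts above, every rank keeps all four DCUs visible and fairseq selects
# the device via --device-id; numactl still pins each rank to its NUMA node.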
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
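# Batch size measured in target tokens (passed to fairseq as --max-tokens).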
#TOKENS=4096
TOKENS=2560
export DATA_PATH=~/data/wmt14_en_de_joined_dict
APP="python3 /work/home/hepj/torch/TransFormer/train.py $DATA_PATH --save-dir 2node-outdir --arch transformer_wmt_en_de --share-decoder-input-output-embed --optimizer adam --adam-betas (0.9,0.98) --clip-norm 0.0 --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 --dropout 0.3 --weight-decay 0.0001 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --max-tokens $TOKENS --eval-bleu --eval-bleu-args {\"beam\":5,\"max_len_a\":1.2,\"max_len_b\":10} --eval-bleu-detok moses --eval-bleu-remove-bpe --eval-bleu-print-samples --best-checkpoint-metric bleu --maximize-best-checkpoint-metric --distributed-rank ${comm_rank} --distributed-world-size ${comm_size} --device-id ${lrank} --local_rank ${lrank} --distributed-init-method tcp://${1}:34567 --distributed-no-spawn --max-epoch 1"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/TransFormer/2node-run
source ~/env22.10.sh
which python3
#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
num_node=0
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
#export NCCL_DEBUG=INFO
#export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
@@ -390,9 +390,21 @@ sbatch fp16_ run_transformer_4dcus.sh
- Use --arch to set the network to test, e.g. transformer_wmt_en_de;
- The mpirun command in run_transformer_4dcus.sh above trains with 4 DCU accelerator cards.
#### 3.5. Multi-node, multi-DCU
```
cd 2node-run
#fp32
sbatch run_transformer_4dcus.sh (adjust #SBATCH -p and #SBATCH -J for your setup; the output is saved to the corresponding slurm file)
#fp16
sbatch run_transformer_4dcus_fp16.sh (adjust #SBATCH -p and #SBATCH -J for your setup; the output is saved to the corresponding slurm file)
```
#### 3.6. Notes on known issues
##### 3.6.1. format error
The error message is as follows:
@@ -414,7 +426,7 @@ self._verbose += f"ref_len = {slef.ref_len:.0f}"
##### 3.6.2 JSON format parsing error
The error message is as follows:
......