Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
7d366e11
Commit
7d366e11
authored
Jan 11, 2023
by
hepj
Browse files
增加torch多机多卡运行
parent
b1232fb0
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
383 additions
and
4 deletions
+383
-4
PyTorch/NLP/Conformer-main/2node-run-comformer/2nodes_single_process.sh
...nformer-main/2node-run-comformer/2nodes_single_process.sh
+58
-0
PyTorch/NLP/Conformer-main/2node-run-comformer/run_conformer_4dcus.sh
...Conformer-main/2node-run-comformer/run_conformer_4dcus.sh
+42
-0
PyTorch/NLP/Conformer-main/README.md
PyTorch/NLP/Conformer-main/README.md
+8
-1
PyTorch/NLP/Vision_Transformer/2node-run-vit/run-vit-finetune.sh
.../NLP/Vision_Transformer/2node-run-vit/run-vit-finetune.sh
+30
-0
PyTorch/NLP/Vision_Transformer/2node-run-vit/run-vit-pre.sh
PyTorch/NLP/Vision_Transformer/2node-run-vit/run-vit-pre.sh
+30
-0
PyTorch/NLP/Vision_Transformer/2node-run-vit/single_finetune-4.sh
...NLP/Vision_Transformer/2node-run-vit/single_finetune-4.sh
+55
-0
PyTorch/NLP/Vision_Transformer/2node-run-vit/single_pre-4.sh
PyTorch/NLP/Vision_Transformer/2node-run-vit/single_pre-4.sh
+53
-0
PyTorch/NLP/Vision_Transformer/README.md
PyTorch/NLP/Vision_Transformer/README.md
+14
-0
PyTorch/NLP/new-Transformer/2node-run/2nodes_single_process.sh
...ch/NLP/new-Transformer/2node-run/2nodes_single_process.sh
+44
-0
PyTorch/NLP/new-Transformer/2node-run/run_transformer_4dcus.sh
...ch/NLP/new-Transformer/2node-run/run_transformer_4dcus.sh
+34
-0
PyTorch/NLP/new-Transformer/README.md
PyTorch/NLP/new-Transformer/README.md
+15
-3
No files found.
PyTorch/NLP/Conformer-main/2node-run-comformer/2nodes_single_process.sh
0 → 100644
View file @
7d366e11
#!/bin/bash
# Per-rank launcher for Conformer training, started once per process by
# mpirun.  $1 is the rendezvous hostname used to build the torch dist_url.
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Map the OpenMPI rank variables onto the names torch.distributed reads.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export RANK=$OMPI_COMM_WORLD_RANK
export WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export HIP_VISIBLE_DEVICES=0,1,2,3

#pyenv activate torch-dtk22.04.2
source ~/env22.04.2.sh

# Training command line; identical for every local rank.
APP="python /work/home/hepj/torch/Conformer-main/main.py \
--model Conformer_small_patch16 \
--data-set IMNET \
--batch-size 64 \
--world_size 4 \
--lr 0.001 \
--local_rank ${comm_rank} \
--dist_url tcp://${1}:9999 \
--data-path /public/DL_DATA/ImageNet-pytorch \
--output_dir /work/home/hepj/torch/Conformer-main/out_dir \
--epochs 1"

# Bind each local rank to its own DCU, InfiniBand HCA and NUMA node.  The
# four original case arms differed only in the rank digit, so a single
# parameterized arm covers local ranks 0-3; any other rank does nothing,
# exactly as before.
case ${lrank} in
[0-3])
    export HIP_VISIBLE_DEVICES=${lrank}
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
PyTorch/NLP/Conformer-main/2node-run-comformer/run_conformer_4dcus.sh
0 → 100644
View file @
7d366e11
#!/usr/bin/env bash
# Slurm launcher: allocate 2 nodes x 4 DCUs each and start the per-rank
# Conformer training script (2nodes_single_process.sh) via mpirun.
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Fixed typo above: "-n 32" was written as "#SBARCH", which Slurm silently
# ignores, so the task count was never requested.

set -x

HOME_PATH=/work/home/hepj
# NOTE(review): this path says "2node-run" but the script is committed under
# "2node-run-comformer" — confirm the directory actually exists.
WORK_PATH=${HOME_PATH}/torch/Conformer-main/2node-run

source ~/env22.10.sh
which python3

#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5

# Build an OpenMPI hostfile (4 slots per node) from the Slurm allocation.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

num_node=0   # initialize explicitly; the original relied on an unset variable
while IFS= read -r node; do
    echo "${node} slots=4" >> "$(pwd)/hostfile-${SLURM_JOB_ID}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first node of the allocation acts as the rendezvous host.
nodename=$(sed -n "1p" "$hostfile")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

#export NCCL_DEBUG=INFO
#export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "hostfile-${SLURM_JOB_ID}" "${WORK_PATH}/2nodes_single_process.sh" "$dist_url"
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID hipprof ${WORK_PATH}/2nodes_single_process.sh $dist_url
PyTorch/NLP/Conformer-main/README.md
View file @
7d366e11
...
...
@@ -39,7 +39,7 @@ import collections.abc as container_abcs
/public/software/apps/DeepLearning/Data/ImageNet-pytorch
##
#
单卡
## 单卡
```
#启动
...
...
@@ -59,3 +59,10 @@ sh脚本中--nnodes 为机器数 ,--nproc_per_node每个机器显卡数目,
./run4.sh
```
## 多机多卡
```
cd 2node-run-comformer
sbatch run_conformer_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改,运行结果保存在相应的slurm文件中)
```
PyTorch/NLP/Vision_Transformer/2node-run-vit/run-vit-finetune.sh
0 → 100644
View file @
7d366e11
#!/usr/bin/env bash
# Slurm launcher: allocate 2 nodes x 4 DCUs each and start the per-rank
# ViT finetuning script (single_finetune-4.sh) via mpirun.
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Fixed typo above: "-n 32" was written as "#SBARCH", which Slurm silently
# ignores, so the task count was never requested.

set -x

# NOTE(review): this path says "2node-run" but the script is committed under
# "2node-run-vit" — confirm the directory actually exists.
WORK_PATH=/work/home/hepj/torch/Vision_Transformer/2node-run

source ~/env22.10.sh
which python3

# Build an OpenMPI hostfile (4 slots per node) from the Slurm allocation.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

num_node=0   # initialize explicitly; the original relied on an unset variable
while IFS= read -r node; do
    echo "${node} slots=4" >> "$(pwd)/hostfile-${SLURM_JOB_ID}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first node of the allocation acts as the rendezvous host.
nodename=$(sed -n "1p" "$hostfile")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

# export NCCL_DEBUG=INFO
# export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "hostfile-${SLURM_JOB_ID}" "${WORK_PATH}/single_finetune-4.sh" "$dist_url"
PyTorch/NLP/Vision_Transformer/2node-run-vit/run-vit-pre.sh
0 → 100644
View file @
7d366e11
#!/usr/bin/env bash
# Slurm launcher: allocate 2 nodes x 4 DCUs each and start the per-rank
# MAE pretraining script (single_pre-4.sh) via mpirun.
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Fixed typo above: "-n 32" was written as "#SBARCH", which Slurm silently
# ignores, so the task count was never requested.

set -x

# NOTE(review): this path says "2node-run" but the script is committed under
# "2node-run-vit" — confirm the directory actually exists.
WORK_PATH=/work/home/hepj/torch/mae-main/2node-run

source ~/env22.04.2.sh
which python3

# Build an OpenMPI hostfile (4 slots per node) from the Slurm allocation.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

num_node=0   # initialize explicitly; the original relied on an unset variable
while IFS= read -r node; do
    echo "${node} slots=4" >> "$(pwd)/hostfile-${SLURM_JOB_ID}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first node of the allocation acts as the rendezvous host.
nodename=$(sed -n "1p" "$hostfile")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "hostfile-${SLURM_JOB_ID}" "${WORK_PATH}/single_pre-4.sh" "$dist_url"
PyTorch/NLP/Vision_Transformer/2node-run-vit/single_finetune-4.sh
0 → 100644
View file @
7d366e11
#!/bin/bash
# Per-rank launcher for ViT finetuning (MAE checkpoint), started once per
# process by mpirun.  $1 is the rendezvous hostname for the torch dist_url.
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0

# OpenMPI rank information for this process.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

source ~/env22.10.sh

export PRETRAIN_CHKPT=/work/home/hepj/model/VIT/mae_pretrain_vit_base.pth
#mae_finetuned_vit_base.pth
export IMAGENET_DIR=/public/DL_DATA/ImageNet-pytorch
export HIP_VISIBLE_DEVICES=0,1,2,3

# Finetuning command line; identical for every local rank.
APP="python /work/home/hepj/torch/mae-main/main_finetune.py \
--batch_size 32 \
--dist_on_itp \
--dist_url tcp://${1}:34567 \
--local_rank ${comm_rank} \
--model vit_base_patch16 \
--finetune ${PRETRAIN_CHKPT} \
--epochs 1 \
--blr 5e-4 --layer_decay 0.65 --weight_decay 0.05 \
--drop_path 0.1 --mixup 0.8 --cutmix 1.0 --reprob 0.25 --dist_eval \
--data_path ${IMAGENET_DIR} \
"

# Bind each local rank to its own DCU, InfiniBand HCA and NUMA node.  The
# four original case arms differed only in the rank digit, so a single
# parameterized arm covers local ranks 0-3; any other rank does nothing,
# exactly as before.
case ${lrank} in
[0-3])
    export HIP_VISIBLE_DEVICES=${lrank}
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    echo numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
PyTorch/NLP/Vision_Transformer/2node-run-vit/single_pre-4.sh
0 → 100644
View file @
7d366e11
#!/bin/bash
# Per-rank launcher for MAE pretraining, started once per process by mpirun.
# $1 is the rendezvous hostname used to build the torch dist_url.
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0

# OpenMPI rank information for this process.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# NOTE(review): sibling scripts comment this pyenv line out and rely on the
# sourced env file only — confirm whether it is still needed here.
pyenv activate torch-dtk22.04.2
source ~/env22.04.2.sh

export PRETRAIN_CHKPT=/work/home/hepj/model/VIT/mae_pretrain_vit_base.pth
#mae_finetuned_vit_base.pth
export IMAGENET_DIR=/public/DL_DATA/ImageNet-pytorch
export HIP_VISIBLE_DEVICES=0,1,2,3

# Pretraining command line; identical for every local rank.
# Fixed: "--model mae_vit_base_patch16" was passed twice in the original.
APP="python /work/home/hepj/torch/mae-main/main_pretrain.py \
--epochs 1 \
--dist_on_itp \
--dist_url tcp://${1}:34567 \
--local_rank ${comm_rank} \
--model mae_vit_base_patch16 \
--batch_size 64 \
--data_path ${IMAGENET_DIR}"

# Bind each local rank to its own DCU, InfiniBand HCA and NUMA node.  The
# four original case arms differed only in the rank digit, so a single
# parameterized arm covers local ranks 0-3; any other rank does nothing.
case ${lrank} in
[0-3])
    export HIP_VISIBLE_DEVICES=${lrank}
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
PyTorch/NLP/Vision_Transformer/README.md
View file @
7d366e11
...
...
@@ -88,6 +88,13 @@ OMP_NUM_THREADS=1 python3 -m torch.distributed.launch --nproc_per_node=4 main_
```
## 多机多卡
```
cd 2node-run-vit
sbatch run-vit-pre.sh (按照自己情况对#SBATCH -p、#SBATCH -J 进行修改;运行结果保存在相应的slurm文件中)
```
# 微调任务
...
...
@@ -135,6 +142,13 @@ OMP_NUM_THREADS=1 python3 -m torch.distributed.launch --nproc_per_node=4 main_fi
--dist_eval --data_path ${IMAGENET_DIR}
```
## 多机多卡
```
cd 2node-run-vit
sbatch run-vit-finetune.sh (按照自己情况对#SBATCH -p、#SBATCH -J 进行修改;运行结果保存在相应的slurm文件中)
```
# 结果验证
验证使用的模型为mae_finetuned_vit_xxx.pth,下载地址:
...
...
PyTorch/NLP/new-Transformer/2node-run/2nodes_single_process.sh
0 → 100644
View file @
7d366e11
#!/bin/bash
# Per-rank launcher for fairseq-style Transformer (WMT14 en-de) training,
# started once per process by mpirun.  $1 is the rendezvous hostname used
# for --distributed-init-method.
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=1

# OpenMPI rank information for this process.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

#TOKENS=4096
TOKENS=2560
export DATA_PATH=~/data/wmt14_en_de_joined_dict

# Training command line; identical for every local rank.
APP="python3 /work/home/hepj/torch/TransFormer/train.py $DATA_PATH \
--save-dir 2node-outdir --arch transformer_wmt_en_de \
--share-decoder-input-output-embed --optimizer adam --adam-betas (0.9,0.98) \
--clip-norm 0.0 --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
--dropout 0.3 --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens $TOKENS \
--eval-bleu --eval-bleu-args {\"beam\":5,\"max_len_a\":1.2,\"max_len_b\":10} \
--eval-bleu-detok moses --eval-bleu-remove-bpe --eval-bleu-print-samples \
--best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
--distributed-rank ${comm_rank} --distributed-world-size ${comm_size} \
--device-id ${lrank} --local_rank ${lrank} \
--distributed-init-method tcp://${1}:34567 --distributed-no-spawn --max-epoch 1"

# Bind each local rank to its own InfiniBand HCA and NUMA node (all DCUs
# stay visible to every rank here, unlike the other per-rank scripts).  The
# four original case arms differed only in the rank digit, so a single
# parameterized arm covers local ranks 0-3; any other rank does nothing.
# NOTE(review): the exported NCCL_SOCKET_IFNAME is eno1 but the per-command
# override below uses ib0 — confirm which interface is intended.
case ${lrank} in
[0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    echo NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
PyTorch/NLP/new-Transformer/2node-run/run_transformer_4dcus.sh
0 → 100644
View file @
7d366e11
#!/usr/bin/env bash
# Slurm launcher: allocate 2 nodes x 4 DCUs each and start the per-rank
# Transformer training script (2nodes_single_process.sh) via mpirun.
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 32
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Fixed typo above: "-n 32" was written as "#SBARCH", which Slurm silently
# ignores, so the task count was never requested.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/TransFormer/2node-run

source ~/env22.10.sh
which python3

#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5

# Build an OpenMPI hostfile (4 slots per node) from the Slurm allocation.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

num_node=0   # initialize explicitly; the original relied on an unset variable
while IFS= read -r node; do
    echo "${node} slots=4" >> "$(pwd)/hostfile-${SLURM_JOB_ID}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first node of the allocation acts as the rendezvous host.
nodename=$(sed -n "1p" "$hostfile")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

#export NCCL_DEBUG=INFO
#export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np "${num_dcu}" --hostfile "hostfile-${SLURM_JOB_ID}" "${WORK_PATH}/2nodes_single_process.sh" "$dist_url"
PyTorch/NLP/new-Transformer/README.md
View file @
7d366e11
...
...
@@ -390,9 +390,21 @@ sbatch fp16_ run_transformer_4dcus.sh
-
通过--arch 设置要测试的网络,eg:transformer_wmt_en_de 等;
-
上述 run_transformer_4dcus.sh中mpirun 运行命令表示使用4张DCU加速卡训练。
#### 3.5.
部分问题说明
#### 3.5.
多机多卡
##### 3.5.1. format错误
```
cd 2node-run
#fp32
sbatch run_transformer_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改,运行结果保存在相应的slurm文件中)
#fp16
sbatch run_transformer_4dcus_fp16.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改,运行结果保存在相应的slurm文件中)
```
#### 3.6. 部分问题说明
##### 3.6.1. format错误
报错信息如下:
...
...
@@ -414,7 +426,7 @@ self._verbose += f"ref_len = {slef.ref_len:.0f}"
##### 3.
5
.2 json格式解析错误
##### 3.
6
.2 json格式解析错误
报错信息如下:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment