Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
b1232fb0
Commit
b1232fb0
authored
Jan 11, 2023
by
hepj
Browse files
增加多机多卡运行
parent
17bc28d5
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
301 additions
and
1 deletion
+301
-1
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
+61
-0
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
+64
-0
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
+32
-0
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
+32
-0
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
+57
-0
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
+41
-0
PyTorch/NLP/BERT/README.md
PyTorch/NLP/BERT/README.md
+14
-1
No files found.
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre1.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT phase-1 pretraining (seq_len=128) on a 2-node
# DCU cluster. mpirun starts one instance per rank (see
# run_bert_pre1_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process_pre1.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

export PATH_PHRASE1=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
    --input_dir=${PATH_PHRASE1} \
    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase1 \
    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
    --bert_model=bert-large-uncased \
    --train_batch_size=16 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --max_steps=100000 \
    --warmup_proportion=0.0 \
    --num_steps_per_checkpoint=20000 \
    --learning_rate=4.0e-4 \
    --seed=12439 \
    --gradient_accumulation_steps=1 \
    --allreduce_post_accumulation \
    --gpus_per_node 2 \
    --do_train \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --dist_url tcp://${master_addr}:34567 \
    --json-summary ${HOME}/outdir/torch/pre_wiki/phrase1/dllogger.json"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-pre/2nodes_single_process_pre2.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT phase-2 pretraining (seq_len=512) on a 2-node
# DCU cluster. mpirun starts one instance per rank (see
# run_bert_pre2_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process_pre2.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

export PATH_PHRASE2=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training

# NOTE(review): --json-summary writes under pre_wiki4/phrase2 while
# --output_dir uses pre_wiki/phrase2 — looks like a typo, but left
# unchanged; confirm which directory is intended.
APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
    --input_dir=${PATH_PHRASE2} \
    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase2 \
    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
    --bert_model=bert-large-uncased \
    --train_batch_size=2 \
    --max_seq_length=512 \
    --max_predictions_per_seq=80 \
    --max_steps=400000 \
    --warmup_proportion=0.128 \
    --num_steps_per_checkpoint=20000 \
    --learning_rate=4.0e-3 \
    --seed=12439 \
    --gradient_accumulation_steps=1 \
    --allreduce_post_accumulation \
    --gpus_per_node 2 \
    --do_train \
    --phase2 \
    --phase1_end_step=0 \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --dist_url tcp://${master_addr}:34567 \
    --json-summary ${HOME}/outdir/torch/pre_wiki4/phrase2/dllogger.json"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre1_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT phase-1 pretraining across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process_pre1.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
which python3

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process_pre1.sh" "$dist_url"
PyTorch/NLP/BERT/2node-run-pre/run_bert_pre2_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT phase-2 pretraining across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process_pre2.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
which python3

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process_pre2.sh" "$dist_url"
PyTorch/NLP/BERT/2node-run-squad/2nodes_single_process.sh
0 → 100644
View file @
b1232fb0
#!/bin/bash
# Per-rank launcher for BERT SQuAD v1.1 fine-tuning on a 2-node DCU
# cluster. mpirun starts one instance per rank (see
# run_bert_squad_4dcus.sh); each local rank is pinned to its own DCU,
# InfiniBand HCA and NUMA node.
#
# Usage: 2nodes_single_process.sh <master-node-address>

export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1

# Rank layout supplied by Open MPI.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Fail fast with a usage message instead of silently building an invalid
# rendezvous URL ("tcp://:34567") when the master address is missing.
master_addr=${1:?usage: $0 <master-node-address>}

APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
    --train_file ${HOME}/data/sq1.1/train-v1.1.json \
    --predict_file ${HOME}/data/sq1.1/dev-v1.1.json \
    --init_checkpoint ${HOME}/model/pytorch_bert/model.ckpt-28252.pt \
    --vocab_file ${HOME}/model/pytorch_bert/vocab.txt \
    --output_dir ${HOME}/outdir/torch/SQUAD4 \
    --config_file ${HOME}/model/pytorch_bert/bert_config.json \
    --json-summary ${HOME}/outdir/torch/SQUAD4/results.json \
    --bert_model bert-large-uncased \
    --do_train \
    --do_predict \
    --train_batch_size 4 \
    --predict_batch_size 4 \
    --gpus_per_node 2 \
    --local_rank ${comm_rank} \
    --world_size ${comm_size} \
    --use_env \
    --dist_url tcp://${master_addr}:34567"

# Bind each local rank to its own device / HCA / NUMA node. Local ranks
# outside 0-3 intentionally fall through and run nothing.
# ${APP} is deliberately unquoted: it is a flat command string that must
# undergo word splitting.  # shellcheck disable=SC2086
case ${lrank} in
[0])
    export HIP_VISIBLE_DEVICES=0
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=1
    export UCX_NET_DEVICES=mlx5_1:1
    export UCX_IB_PCI_BW=mlx5_1:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=2
    export UCX_NET_DEVICES=mlx5_2:1
    export UCX_IB_PCI_BW=mlx5_2:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=3
    export UCX_NET_DEVICES=mlx5_3:1
    export UCX_IB_PCI_BW=mlx5_3:50Gbs
    echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
esac
PyTorch/NLP/BERT/2node-run-squad/run_bert_squad_4dcus.sh
0 → 100644
View file @
b1232fb0
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# Slurm batch script: launch BERT SQuAD fine-tuning across 2 nodes with
# 4 DCUs each; mpirun runs 2nodes_single_process.sh once per rank.
#
# Fixes vs. previous version:
#  - "#SBARCH -n 32" was a typo, so the directive was silently ignored;
#    corrected to "#SBATCH -n 8" (2 nodes x 4 tasks-per-node) to agree
#    with -N/--ntasks-per-node and the mpirun -np computed below.
#  - num_node is initialised explicitly (it was previously relied on to
#    expand empty on first use).
#  - The MPI hostfile is truncated before the loop so resubmitting in the
#    same directory does not append stale entries.

set -x

HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-squad/2node-run
source ~/env22.10.sh
which python3

#export NCCL_GRAPH_DUMP_FILE=graph.xml
#export NCCL_GRAPH_FILE=test.xml
#export NCCL_NET_GDR_LEVEL=5

# One allocated hostname per line.
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"

# Build the Open MPI hostfile ("<host> slots=4") and count the nodes.
num_node=0
mpi_hostfile=$(pwd)/hostfile-$SLURM_JOB_ID
: > "${mpi_hostfile}"
while IFS= read -r node; do
    echo "${node} slots=4" >> "${mpi_hostfile}"
    num_node=$((num_node + 1))
done < "${hostfile}"

num_dcu=$((num_node * 4))
echo "$num_dcu"

# The first allocated node acts as the rendezvous master.
nodename=$(head -n 1 "${hostfile}")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/single_process_ddp.sh $dist_url
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
#hipprof mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/hipprof_single.sh $dist_url
#hipprof mpirun -np 4 --hostfile hostfile-18261131 hipprof_single.sh j17r3n01
mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
    "${WORK_PATH}/2nodes_single_process.sh" "$dist_url"
#mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID hipprof ${WORK_PATH}/2nodes_single_process.sh $dist_url
PyTorch/NLP/BERT/README.md
View file @
b1232fb0
...
@@ -108,6 +108,14 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -108,6 +108,14 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
./bert_squad4_fp16.sh #半精度 (按自己路径对single_squad4_fp16.sh里APP设置进行修改)
./bert_squad4_fp16.sh #半精度 (按自己路径对single_squad4_fp16.sh里APP设置进行修改)
```
```
```
#多机多卡
cd 2node-run-squad
sbatch run_bert_squad_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
## 4.**PHRASE测试**
## 4.**PHRASE测试**
### 1.参数说明
### 1.参数说明
...
@@ -142,6 +150,9 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -142,6 +150,9 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
#多卡
./bert_pre1_4.sh #单精度 (按自己路径对single_pre1_4.sh里APP设置进行修改)
./bert_pre1_4.sh #单精度 (按自己路径对single_pre1_4.sh里APP设置进行修改)
./bert_pre1_4_fp16.sh #半精度 (按自己路径对single_pre1_4_fp16.sh里APP设置进行修改)
./bert_pre1_4_fp16.sh #半精度 (按自己路径对single_pre1_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre1_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
```
### 3.PHRASE2
### 3.PHRASE2
...
@@ -153,6 +164,8 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
...
@@ -153,6 +164,8 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
#多卡
#多卡
./bert_pre2_4.sh #单精度 (按自己路径对single_pre2_4.sh里APP设置进行修改)
./bert_pre2_4.sh #单精度 (按自己路径对single_pre2_4.sh里APP设置进行修改)
./bert_pre2_4_fp16.sh #半精度 (按自己路径对single_pre2_4_fp16.sh里APP设置进行修改)
./bert_pre2_4_fp16.sh #半精度 (按自己路径对single_pre2_4_fp16.sh里APP设置进行修改)
#多机多卡
cd 2node-run-pre
sbatch run_bert_pre2_4dcus.sh (按照自己情况对#SBATCH -p、#SBATCH -J进行修改;需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中)
```
```
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment