Commit 56940268 authored by hepj987

Adjust multi-node running

parent ae52a181
@@ -22,7 +22,7 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
     --seed=12439 \
     --gradient_accumulation_steps=1 \
     --allreduce_post_accumulation \
-    --gpus_per_node 2 \
+    --gpus_per_node ${2} \
     --do_train \
     --local_rank ${comm_rank} \
     --world_size ${comm_size} \
@@ -32,30 +32,42 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
@@ -22,7 +22,7 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
     --seed=12439 \
     --gradient_accumulation_steps=1 \
     --allreduce_post_accumulation \
-    --gpus_per_node 2 \
+    --gpus_per_node ${2} \
     --do_train \
     --phase2 \
     --phase1_end_step=0 \
@@ -35,30 +35,42 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
 source $ROCM_PATH/env.sh
\ No newline at end of file
j20r4n01 slots=8
j20r4n02 slots=8
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
node=$(cat $hostfile|sort|uniq |wc -l)
np=$(($node*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url $node
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
node=$(cat $hostfile|sort|uniq |wc -l)
np=$(($node*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url $node
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
@@ -28,30 +26,42 @@ APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
 source $ROCM_PATH/env.sh
\ No newline at end of file
j20r4n01 slots=8
j20r4n02 slots=8
j20r4n01 slots=4
j20r4n02 slots=4
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p kshdnormal
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/public/home/hepj
WORK_PATH=${HOME_PATH}/torch/BERT/2node-run-squad
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-squad
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process.sh $dist_url
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
COPY requirements.txt requirements.txt
RUN source /opt/dtk-22.10.1/env.sh
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
ENV LANG C.UTF-8
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
RUN git clone https://github.com/soskek/bookcorpus.git
# Copy the perf_client over
COPY --from=trt /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
# Install trt python api
RUN apt-get install -y libb64-0d
RUN pip install /workspace/install/python/tensorrtserver*.whl
WORKDIR /workspace/bert
RUN pip install --upgrade --no-cache-dir pip \
&& pip install --no-cache-dir \
tqdm boto3 requests six ipdb h5py html2text nltk progressbar onnxruntime \
git+https://github.com/NVIDIA/dllogger wget
RUN apt-get install -y iputils-ping
COPY . .
@@ -23,36 +23,51 @@ BERT stands for Bidirectional Encoder Representation from Transformers; it is a
BERT does not use the full Transformer architecture (Encoder + Decoder); it uses only the Encoder part of the Transformer, stacking multiple Encoder layers to form its basic network structure.
```
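As a rough illustration of the encoder-only design just described, here is a minimal sketch built from stock PyTorch modules, with bert-large-like dimensions assumed; it is not this repository's model code:
```
# Minimal sketch of the encoder-only idea above (stock PyTorch, not this repo's model code)
import torch
import torch.nn as nn

# bert-large-like dimensions (assumed): hidden 1024, 16 heads, 24 encoder layers
layer = nn.TransformerEncoderLayer(d_model=1024, nhead=16, dim_feedforward=4096)
trunk = nn.TransformerEncoder(layer, num_layers=24)  # a stack of Encoders only, no Decoder

x = torch.randn(128, 2, 1024)  # (seq_len, batch, hidden) dummy token embeddings
out = trunk(x)                 # contextualized representations, same shape as the input
```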
Environment configuration
## Environment configuration
`Note: the dtk, python, torch, and apex versions must be aligned with each other`
### Docker (method 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
# enter the container and install any missing dependencies
docker run -dit --network=host --name=bert-pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
docker exec -it bert-pytorch /bin/bash
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# for the tensorflow wheel, see the download link under the Conda method below
pip install tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl
```
1. Create a python virtual environment and activate it
virtualenv --python=~/package/Python-3.6.8/build/bin/python3 venv_dtk21.10.1_torch1.10
source venv_dtk21.10.1_torch1.10/bin/activate
2. Install the dependency packages
pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
### Dockerfile (method 2)
```
docker build -t bert:latest .
docker run -dit --network=host --name=bert-pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 bert:latest
docker exec -it bert-pytorch /bin/bash
# for the tensorflow wheel, see the download link under the Conda method below
pip install tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl
```
pip install torch-1.10.0a0+gitcc7c9c7-cp36-cp36m-linux_x86_64.whl
pip install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_x86_64.whl
pip install apex-0.1-cp36-cp36m-linux_x86_64.whl
### Conda (method 3)
3. Environment variable setup
```
module rm compiler/rocm/2.9
export ROCM_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
```
```
# create the virtual environment
conda create -n bert-pytorch python=3.7
```
The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded and installed from the [光合](https://developer.hpccube.com/tool/) developer community.
[pytorch1.10](https://cancon.hpccube.com:65024/directlink/4/pytorch/dtk22.10/torch-1.10.0a0+git2040069.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl)
[tensorflow2.7](https://cancon.hpccube.com:65024/directlink/4/tensorflow/dtk22.10/tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl)
[DTK22.10](https://cancon.hpccube.com:65024/directlink/1/DTK-22.10.1/CentOS7.6/DTK-22.10.1-CentOS7.6-x86_64.tar.gz)
Install the remaining dependencies from requirements.txt:
```
pip install -r requirements.txt
```
@@ -65,21 +80,6 @@ https://dumps.wikimedia.org/enwiki/20220401/
Here we use the wiki dataset already on the server; the data has been downloaded and preprocessed. The pretraining data is divided into PHRASE1 and PHRASE2.
```
Kunshan wiki dataset path, PHRASE1:
PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
Kunshan wiki dataset path, PHRASE2:
PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
```
```
Wuzhen wiki path, PHRASE1:
/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
Wuzhen wiki path, PHRASE2:
/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
```
`wiki dataset structure`
```
@@ -94,6 +94,21 @@ PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_
│ └── ...
```
```
# example: downloading and processing the wiki dataset
cd cleanup_scripts
mkdir -p wiki
cd wiki
wget https://dumps.wikimedia.org/enwiki/20200101/enwiki-20200101-pages-articles-multistream.xml.bz2 # Optionally use curl instead
bzip2 -d enwiki-20200101-pages-articles-multistream.xml.bz2
cd .. # back to bert/cleanup_scripts
git clone https://github.com/attardi/wikiextractor.git
python3 wikiextractor/WikiExtractor.py wiki/enwiki-20200101-pages-articles-multistream.xml # Results are placed in bert/cleanup_scripts/text
./process_wiki.sh '<text/*/wiki_??'
```
SQuAD 1.1 question-answering data:
[train-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
@@ -107,20 +122,25 @@ PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_
└── train-v1.1.json
```
## Model weight download
[bert-large-uncased model for SQuAD training (already converted, ready to use); extraction code: vs8d](https://pan.baidu.com/share/init?surl=V8kFpgsLQe8tOAeft-5UpQ)
[bert-large-uncased_L-24_H-1024_A-16 (needs conversion)](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip)
[bert-base-uncased_L-12_H-768_A-12 (needs conversion)](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip)
## Training
### SQuAD training
#### 1. Model conversion
```
python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k_ckpt/model.ckpt-28252 --bert_config_path ~/NLP/cks/bs64k_32k_ckpt/bert_config.json --output_checkpoint model.ckpt-28252.pt
# if the downloaded model is in .ckpt format, it must be converted to .ckpt.pt format
python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint uncased_L-24_H-1024_A-16/bert_model.ckpt --bert_config_path uncased_L-24_H-1024_A-16/bert_config.json --output_checkpoint uncased_L-24_H-1024_A-16/model.ckpt.pt
```
Model conversion currently still has issues, possibly because the downloaded TF model differs from model.ckpt-28252, or because of torch/apex version compatibility; this is still under investigation. You can use the already-converted model directly for SQuAD fine-tuning (the PHRASE tests are unaffected: PHRASE is pretraining, which needs only the training data and the network structure and loads no checkpoint).
[Converted model; extraction code: vs8d](https://pan.baidu.com/share/init?surl=V8kFpgsLQe8tOAeft-5UpQ)
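For orientation, a conversion of this kind essentially renames TF checkpoint variables to PyTorch state_dict keys and transposes dense kernels. The sketch below is a hypothetical, simplified view under those assumptions; the real logic lives in tf_to_torch/convert_tf_checkpoint.py and may map names differently:
```
# Hypothetical, simplified TF -> PyTorch checkpoint conversion;
# the repo's actual converter is tf_to_torch/convert_tf_checkpoint.py.
import numpy as np
import tensorflow as tf
import torch

tf_ckpt = "uncased_L-24_H-1024_A-16/bert_model.ckpt"
state_dict = {}
for name, _shape in tf.train.list_variables(tf_ckpt):
    array = tf.train.load_variable(tf_ckpt, name)
    if name.endswith("kernel"):
        # TF Dense stores an (in, out) "kernel"; torch nn.Linear expects an (out, in) "weight"
        array = np.ascontiguousarray(array.T)
    state_dict[name.replace("/", ".").replace("kernel", "weight")] = torch.from_numpy(array)
torch.save({"model": state_dict}, "uncased_L-24_H-1024_A-16/model.ckpt.pt")
```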
#### 2. Parameter description
```
@@ -148,23 +168,27 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
# single card
./bert_squad.sh # single precision (adjust the APP settings in single_squad.sh to your own paths)
./bert_squad_fp16.sh # half precision (adjust the APP settings in single_squad_fp16.sh to your own paths)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
```
# multi-card
./bert_squad4.sh # single precision (adjust the APP settings in single_squad4.sh to your own paths)
./bert_squad4_fp16.sh # half precision (adjust the APP settings in single_squad4_fp16.sh to your own paths)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
```
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-squad
sbatch run_bert_squad_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_squad_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
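For context, the wrapper scripts read OMPI_COMM_WORLD_RANK/SIZE from mpirun and pass $dist_url (the first hostname in the hostfile) into the python entrypoint. A training script typically turns these into a process-group initialization roughly like the hedged sketch below; the port and exact call pattern are assumptions, not taken from run_squad_v4.py:
```
# Hedged sketch of the usual torch.distributed setup behind the flags above;
# run_squad_v4.py / run_pretraining_v4.py may do this differently.
import os
import torch

rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # exported by mpirun, passed as --local_rank ${comm_rank}
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])  # passed as --world_size ${comm_size}
master = os.environ.get("DIST_URL", "j20r4n01")       # $dist_url: first hostname in the hostfile (assumed)

torch.distributed.init_process_group(
    backend="nccl",                                   # maps to RCCL on DCU/ROCm builds of PyTorch
    init_method="tcp://{}:23456".format(master),      # port 23456 is an assumption
    rank=rank,
    world_size=world_size,
)
```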
### 4. **PHRASE test**
### **PHRASE test**
#### 1. Parameter description
@@ -198,9 +222,11 @@ sbatch run_bert_squad_4dcu.sh
# multi-card
./bert_pre1_4.sh # single precision (adjust the APP settings in single_pre1_4.sh to your own paths)
./bert_pre1_4_fp16.sh # half precision (adjust the APP settings in single_pre1_4_fp16.sh to your own paths)
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-pre
sbatch run_bert_pre1_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_pre1_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
```
#### 3. PHRASE2
@@ -212,12 +238,14 @@ sbatch run_bert_pre1_4dcu.sh
# multi-card
./bert_pre2_4.sh # single precision (adjust the APP settings in single_pre2_4.sh to your own paths)
./bert_pre2_4_fp16.sh # half precision (adjust the APP settings in single_pre2_4_fp16.sh to your own paths)
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-pre
sbatch run_bert_pre2_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_pre2_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
```
## Accuracy data
## Accuracy
| Training | Cards | batch size | Iteration count | Accuracy |
| ------- | ---- | ---------- | -------- | ------------------------------ |
@@ -228,7 +256,7 @@
## Algorithm category
`Natural language processing`
`Question answering`
## Hot industries