Commit 56940268 authored by hepj987

Adjust multi-node running

parent ae52a181
@@ -22,7 +22,7 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
     --seed=12439 \
     --gradient_accumulation_steps=1 \
     --allreduce_post_accumulation \
-    --gpus_per_node 2 \
+    --gpus_per_node ${2} \
     --do_train \
     --local_rank ${comm_rank} \
     --world_size ${comm_size} \
@@ -32,30 +32,42 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
@@ -22,7 +22,7 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
     --seed=12439 \
     --gradient_accumulation_steps=1 \
     --allreduce_post_accumulation \
-    --gpus_per_node 2 \
+    --gpus_per_node ${2} \
     --do_train \
     --phase2 \
     --phase1_end_step=0 \
@@ -35,30 +35,42 @@ APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
 source $ROCM_PATH/env.sh
\ No newline at end of file
j20r4n01 slots=8
j20r4n02 slots=8
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
node=$(cat $hostfile|sort|uniq |wc -l)
np=$(($node*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url $node
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/work/home/hepj
WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
node=$(cat $hostfile|sort|uniq |wc -l)
np=$(($node*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url $node
#!/bin/bash
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
export NCCL_SOCKET_IFNAME=eno1
export HSA_USERPTR_FOR_PAGED_MEM=0
export HIP_LAUNCH_BLOCKING=1
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
@@ -28,30 +26,42 @@ APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
 case ${lrank} in
 [0])
   export HIP_VISIBLE_DEVICES=0
-  export UCX_NET_DEVICES=mlx5_0:1
-  export UCX_IB_PCI_BW=mlx5_0:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [1])
   export HIP_VISIBLE_DEVICES=1
-  export UCX_NET_DEVICES=mlx5_1:1
-  export UCX_IB_PCI_BW=mlx5_1:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [2])
   export HIP_VISIBLE_DEVICES=2
-  export UCX_NET_DEVICES=mlx5_2:1
-  export UCX_IB_PCI_BW=mlx5_2:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
 [3])
   export HIP_VISIBLE_DEVICES=3
-  export UCX_NET_DEVICES=mlx5_3:1
-  export UCX_IB_PCI_BW=mlx5_3:50Gbs
-  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
-  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
   ;;
+[4])
+  export HIP_VISIBLE_DEVICES=4
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=5
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=6
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=7
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
 esac
 source $ROCM_PATH/env.sh
\ No newline at end of file
j20r4n01 slots=8
j20r4n02 slots=8
j20r4n01 slots=4
j20r4n02 slots=4
#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p kshdnormal
#SBATCH -N 2
#SBATCH -n 8
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
set -x
HOME_PATH=/public/home/hepj
WORK_PATH=${HOME_PATH}/torch/BERT/2node-run-squad
source ~/env22.10.sh
WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-squad
which python3
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
((num_node=${num_node}+1))
done
num_dcu=$((${num_node}*4))
echo $num_dcu
source env.sh
hostfile=./hostfile
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*8))
nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
export HSA_USERPTR_FOR_PAGED_MEM=0
mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process.sh $dist_url
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
COPY requirements.txt requirements.txt
RUN source /opt/dtk-22.10.1/env.sh
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
ENV LANG C.UTF-8
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
RUN git clone https://github.com/soskek/bookcorpus.git
# Copy the perf_client over
COPY --from=trt /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
# Install trt python api
RUN apt-get install -y libb64-0d
RUN pip install /workspace/install/python/tensorrtserver*.whl
WORKDIR /workspace/bert
RUN pip install --upgrade --no-cache-dir pip \
&& pip install --no-cache-dir \
tqdm boto3 requests six ipdb h5py html2text nltk progressbar onnxruntime \
git+https://github.com/NVIDIA/dllogger wget
RUN apt-get install -y iputils-ping
COPY . .
@@ -23,36 +23,51 @@ BERT stands for Bidirectional Encoder Representation from Transformers; it is a
BERT does not use the full Transformer architecture (Encoder + Decoder); it uses only the Encoder part of the Transformer, stacking multiple Encoder layers to form its basic network structure.
```
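As a rough illustration of the encoder-only design just described, here is a minimal sketch built from stock PyTorch modules, with bert-large-like dimensions assumed; it is not this repository's model code:
```
# Minimal sketch of the encoder-only idea above (stock PyTorch, not this repo's model code)
import torch
import torch.nn as nn

# bert-large-like dimensions (assumed): hidden 1024, 16 heads, 24 encoder layers
layer = nn.TransformerEncoderLayer(d_model=1024, nhead=16, dim_feedforward=4096)
trunk = nn.TransformerEncoder(layer, num_layers=24)  # a stack of Encoders only, no Decoder

x = torch.randn(128, 2, 1024)  # (seq_len, batch, hidden) dummy token embeddings
out = trunk(x)                 # contextualized representations, same shape as the input
```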
Environment configuration
## Environment configuration
`Note: the dtk, python, torch, and apex versions must be aligned with each other`
### Docker (method 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
# enter the container and install any missing dependencies
docker run -dit --network=host --name=bert-pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py37-latest
docker exec -it bert-pytorch /bin/bash
pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# for the tensorflow wheel, see the download link under the Conda method below
pip install tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl
```
1. Create a python virtual environment and activate it
virtualenv --python=~/package/Python-3.6.8/build/bin/python3 venv_dtk21.10.1_torch1.10
source venv_dtk21.10.1_torch1.10/bin/activate
2. Install the dependency packages
pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
### Dockerfile (method 2)
```
docker build -t bert:latest .
docker run -dit --network=host --name=bert-pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 bert:latest
docker exec -it bert-pytorch /bin/bash
# for the tensorflow wheel, see the download link under the Conda method below
pip install tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl
```
pip install torch-1.10.0a0+gitcc7c9c7-cp36-cp36m-linux_x86_64.whl
pip install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_x86_64.whl
pip install apex-0.1-cp36-cp36m-linux_x86_64.whl
### Conda (method 3)
3. Environment variable setup
```
module rm compiler/rocm/2.9
export ROCM_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1
export HIP_PATH=${ROCM_PATH}/hip
export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_ENABLE_LOGGING_CMD=1
export ROCBLAS_LAYER=3
module unload compiler/rocm/2.9
echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
```
```
# create the virtual environment
conda create -n bert-pytorch python=3.7
```
The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded and installed from the [光合](https://developer.hpccube.com/tool/) developer community.
[pytorch1.10](https://cancon.hpccube.com:65024/directlink/4/pytorch/dtk22.10/torch-1.10.0a0+git2040069.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl)
[tensorflow2.7](https://cancon.hpccube.com:65024/directlink/4/tensorflow/dtk22.10/tensorflow-2.7.0+git67f0ade9.dtk2210-cp37-cp37m-manylinux2014_x86_64.whl)
[DTK22.10](https://cancon.hpccube.com:65024/directlink/1/DTK-22.10.1/CentOS7.6/DTK-22.10.1-CentOS7.6-x86_64.tar.gz)
Install the remaining dependencies from requirements.txt:
```
pip install -r requirements.txt
```
@@ -65,21 +80,6 @@ https://dumps.wikimedia.org/enwiki/20220401/
Here we use the wiki dataset already on the server; the data has been downloaded and preprocessed. The pretraining data is divided into PHRASE1 and PHRASE2.
```
Kunshan wiki dataset path, PHRASE1:
PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
Kunshan wiki dataset path, PHRASE2:
PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
```
```
Wuzhen wiki path, PHRASE1:
/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
Wuzhen wiki path, PHRASE2:
/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
```
`wiki dataset structure`
```
@@ -94,6 +94,21 @@ PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_
│ └── ...
```
```
# example: downloading and processing the wiki dataset
cd cleanup_scripts
mkdir -p wiki
cd wiki
wget https://dumps.wikimedia.org/enwiki/20200101/enwiki-20200101-pages-articles-multistream.xml.bz2 # Optionally use curl instead
bzip2 -d enwiki-20200101-pages-articles-multistream.xml.bz2
cd .. # back to bert/cleanup_scripts
git clone https://github.com/attardi/wikiextractor.git
python3 wikiextractor/WikiExtractor.py wiki/enwiki-20200101-pages-articles-multistream.xml # Results are placed in bert/cleanup_scripts/text
./process_wiki.sh '<text/*/wiki_??'
```
SQuAD 1.1 question-answering data:
[train-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
@@ -107,20 +122,25 @@ PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_
└── train-v1.1.json
```
## Model weight download
[bert-large-uncased model for SQuAD training (already converted, ready to use); extraction code: vs8d](https://pan.baidu.com/share/init?surl=V8kFpgsLQe8tOAeft-5UpQ)
[bert-large-uncased_L-24_H-1024_A-16 (needs conversion)](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip)
[bert-base-uncased_L-12_H-768_A-12 (needs conversion)](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip)
## Training
### SQuAD training
#### 1. Model conversion
```
python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k_ckpt/model.ckpt-28252 --bert_config_path ~/NLP/cks/bs64k_32k_ckpt/bert_config.json --output_checkpoint model.ckpt-28252.pt
# if the downloaded model is in .ckpt format, it must be converted to .ckpt.pt format
python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint uncased_L-24_H-1024_A-16/bert_model.ckpt --bert_config_path uncased_L-24_H-1024_A-16/bert_config.json --output_checkpoint uncased_L-24_H-1024_A-16/model.ckpt.pt
```
Model conversion currently still has issues, possibly because the downloaded TF model differs from model.ckpt-28252, or because of torch/apex version compatibility; this is still under investigation. You can use the already-converted model directly for SQuAD fine-tuning (the PHRASE tests are unaffected: PHRASE is pretraining, which needs only the training data and the network structure and loads no checkpoint).
[Converted model; extraction code: vs8d](https://pan.baidu.com/share/init?surl=V8kFpgsLQe8tOAeft-5UpQ)
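For orientation, a conversion of this kind essentially renames TF checkpoint variables to PyTorch state_dict keys and transposes dense kernels. The sketch below is a hypothetical, simplified view under those assumptions; the real logic lives in tf_to_torch/convert_tf_checkpoint.py and may map names differently:
```
# Hypothetical, simplified TF -> PyTorch checkpoint conversion;
# the repo's actual converter is tf_to_torch/convert_tf_checkpoint.py.
import numpy as np
import tensorflow as tf
import torch

tf_ckpt = "uncased_L-24_H-1024_A-16/bert_model.ckpt"
state_dict = {}
for name, _shape in tf.train.list_variables(tf_ckpt):
    array = tf.train.load_variable(tf_ckpt, name)
    if name.endswith("kernel"):
        # TF Dense stores an (in, out) "kernel"; torch nn.Linear expects an (out, in) "weight"
        array = np.ascontiguousarray(array.T)
    state_dict[name.replace("/", ".").replace("kernel", "weight")] = torch.from_numpy(array)
torch.save({"model": state_dict}, "uncased_L-24_H-1024_A-16/model.ckpt.pt")
```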
#### 2. Parameter description
```
@@ -148,23 +168,27 @@ python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k
# single card
./bert_squad.sh # single precision (adjust the APP settings in single_squad.sh to your own paths)
./bert_squad_fp16.sh # half precision (adjust the APP settings in single_squad_fp16.sh to your own paths)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
```
# multi-card
./bert_squad4.sh # single precision (adjust the APP settings in single_squad4.sh to your own paths)
./bert_squad4_fp16.sh # half precision (adjust the APP settings in single_squad4_fp16.sh to your own paths)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
```
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-squad
sbatch run_bert_squad_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_squad_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
--init_checkpoint uses model.ckpt-28252.pt or your own converted model.ckpt.pt
```
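For context, the wrapper scripts read OMPI_COMM_WORLD_RANK/SIZE from mpirun and pass $dist_url (the first hostname in the hostfile) into the python entrypoint. A training script typically turns these into a process-group initialization roughly like the hedged sketch below; the port and exact call pattern are assumptions, not taken from run_squad_v4.py:
```
# Hedged sketch of the usual torch.distributed setup behind the flags above;
# run_squad_v4.py / run_pretraining_v4.py may do this differently.
import os
import torch

rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # exported by mpirun, passed as --local_rank ${comm_rank}
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])  # passed as --world_size ${comm_size}
master = os.environ.get("DIST_URL", "j20r4n01")       # $dist_url: first hostname in the hostfile (assumed)

torch.distributed.init_process_group(
    backend="nccl",                                   # maps to RCCL on DCU/ROCm builds of PyTorch
    init_method="tcp://{}:23456".format(master),      # port 23456 is an assumption
    rank=rank,
    world_size=world_size,
)
```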
### 4. **PHRASE test**
### **PHRASE test**
#### 1. Parameter description
@@ -198,9 +222,11 @@ sbatch run_bert_squad_4dcu.sh
# multi-card
./bert_pre1_4.sh # single precision (adjust the APP settings in single_pre1_4.sh to your own paths)
./bert_pre1_4_fp16.sh # half precision (adjust the APP settings in single_pre1_4_fp16.sh to your own paths)
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-pre
sbatch run_bert_pre1_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_pre1_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
```
#### 3. PHRASE2
@@ -212,12 +238,14 @@ sbatch run_bert_pre1_4dcu.sh
# multi-card
./bert_pre2_4.sh # single precision (adjust the APP settings in single_pre2_4.sh to your own paths)
./bert_pre2_4_fp16.sh # half precision (adjust the APP settings in single_pre2_4_fp16.sh to your own paths)
# multi-node multi-card
# log in to node 1 and edit the hostfile for your environment; make sure both nodes have identical file paths and configuration. If needed, change the interface name to the NIC that ip a shows for each node's IP, and adjust the numactl bindings to match the node's NUMA topology
cd 2node-run-pre
sbatch run_bert_pre2_4dcu.sh (adjust #SBATCH -p and #SBATCH -J to your setup; for fp16, add --fp16 and --amp to APP in the corresponding single_* file; results are written to the corresponding slurm output file)
sh run_bert_pre2_4dcu.sh (for fp16, add --fp16 and --amp to APP in the corresponding single_* file)
```
## Accuracy data
## Accuracy
| Training | Cards | batch size | Iteration count | Accuracy |
| ------- | ---- | ---------- | -------- | ------------------------------ |
@@ -228,7 +256,7 @@
## Algorithm category
`Natural language processing`
`Question answering`
## Hot industries