初始化仓库

e5ca7e62 · hepj987 · e5ca7e62 · e5ca7e62 · e5ca7e62 · e5ca7e62
Commit e5ca7e62 authored Jul 17, 2023 by hepj987
20 changed files
--- a/.gitignore
+++ b/.gitignore
+# Initially taken from Github's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Data, checkpoints and results
+data/*/*/   
+data/*/*.zip
+checkpoints/
+results
+results/*
+
+#Editor
+.idea
+.idea/*
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# vscode
+.vscode
+
+# TF code
+tensorflow_code
+
+# Models
+models
--- a/.gitmodules
+++ b/.gitmodules
--- a/2node-run-pre/2nodes_single_process_pre1.sh
+++ b/2node-run-pre/2nodes_single_process_pre1.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+export HIP_LAUNCH_BLOCKING=1
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export PATH_PHRASE1=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase1 \
+    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --gpus_per_node 2 \
+    --do_train \
+    --local_rank ${comm_rank} \
+    --world_size ${comm_size} \
+    --dist_url tcp://${1}:34567 \
+    --json-summary ${HOME}/outdir/torch/pre_wiki/phrase1/dllogger.json
+ "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=1
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=2
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
--- a/2node-run-pre/2nodes_single_process_pre2.sh
+++ b/2node-run-pre/2nodes_single_process_pre2.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+export HIP_LAUNCH_BLOCKING=1
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export PATH_PHRASE2=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py  \
+    --input_dir=${PATH_PHRASE2}    \
+    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase2 \
+    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=2 \
+    --max_seq_length=512 \
+    --max_predictions_per_seq=80 \
+    --max_steps=400000 \
+    --warmup_proportion=0.128 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-3 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --gpus_per_node 2 \
+    --do_train \
+    --phase2 \
+    --phase1_end_step=0 \
+    --local_rank ${comm_rank} \
+    --world_size ${comm_size} \
+    --dist_url tcp://${1}:34567 \
+    --json-summary ${HOME}/outdir/torch/pre_wiki4/phrase2/dllogger.json
+ "
+
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=1
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=2
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
--- a/2node-run-pre/run_bert_pre1_4dcus.sh
+++ b/2node-run-pre/run_bert_pre1_4dcus.sh
+#!/usr/bin/env bash
+#SBATCH -J 2node-test
+#SBATCH -p wzhdtest
+#SBATCH -N 2
+#SBARCH -n 32
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=dcu:4
+set -x
+
+HOME_PATH=/work/home/hepj
+WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
+source ~/env22.10.sh
+which python3
+hostfile=./$SLURM_JOB_ID
+scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
+for i in `cat $hostfile`
+do
+    echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
+    ((num_node=${num_node}+1))
+done
+num_dcu=$((${num_node}*4))
+echo $num_dcu
+
+nodename=$(cat $hostfile |sed -n "1p")
+echo $nodename
+dist_url=`echo $nodename | awk '{print $1}'`
+export NCCL_DEBUG=INFO
+export HSA_USERPTR_FOR_PAGED_MEM=0
+
+mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url
+
--- a/2node-run-pre/run_bert_pre2_4dcus.sh
+++ b/2node-run-pre/run_bert_pre2_4dcus.sh
+#!/usr/bin/env bash
+#SBATCH -J 2node-test
+#SBATCH -p wzhdtest
+#SBATCH -N 2
+#SBARCH -n 32
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=dcu:4
+set -x
+
+HOME_PATH=/work/home/hepj
+WORK_PATH=${HOME_PATH}/torch/bert-pretrain/2node-run
+source ~/env22.10.sh
+which python3
+hostfile=./$SLURM_JOB_ID
+scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
+for i in `cat $hostfile`
+do
+    echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
+    ((num_node=${num_node}+1))
+done
+num_dcu=$((${num_node}*4))
+echo $num_dcu
+
+nodename=$(cat $hostfile |sed -n "1p")
+echo $nodename
+dist_url=`echo $nodename | awk '{print $1}'`
+export NCCL_DEBUG=INFO
+export HSA_USERPTR_FOR_PAGED_MEM=0
+
+mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url
+
--- a/2node-run-squad/2nodes_single_process.sh
+++ b/2node-run-squad/2nodes_single_process.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+export HIP_LAUNCH_BLOCKING=1
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
+  --train_file  ${HOME}/data/sq1.1/train-v1.1.json \
+  --predict_file  ${HOME}/data/sq1.1/dev-v1.1.json \
+  --init_checkpoint  ${HOME}/model/pytorch_bert/model.ckpt-28252.pt \
+  --vocab_file  ${HOME}/model/pytorch_bert/vocab.txt \
+  --output_dir  ${HOME}/outdir/torch/SQUAD4 \
+  --config_file  ${HOME}/model/pytorch_bert/bert_config.json \
+  --json-summary  ${HOME}/outdir/torch/SQUAD4/results.json \
+  --bert_model bert-large-uncased \
+  --do_train \
+  --do_predict \
+  --train_batch_size  4 \
+  --predict_batch_size 4 \
+  --gpus_per_node  2 \
+  --local_rank ${comm_rank} \
+  --world_size ${comm_size} \
+  --use_env  \
+  --dist_url tcp://${1}:34567 \
+ "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=1
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=2
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  echo NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}  
+  NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
--- a/2node-run-squad/37908748
+++ b/2node-run-squad/37908748
+j20r4n01
+j20r4n02
--- a/2node-run-squad/hostfile-37908748
+++ b/2node-run-squad/hostfile-37908748
+j20r4n01 slots=4
+j20r4n02 slots=4
--- a/2node-run-squad/run_bert_squad_4dcus.sh
+++ b/2node-run-squad/run_bert_squad_4dcus.sh
+#!/usr/bin/env bash
+#SBATCH -J 2node-test
+#SBATCH -p kshdnormal
+#SBATCH -N 2
+#SBARCH -n 32
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=dcu:4
+set -x
+
+HOME_PATH=/public/home/hepj
+WORK_PATH=${HOME_PATH}/torch/BERT/2node-run-squad
+source ~/env22.10.sh
+which python3
+hostfile=./$SLURM_JOB_ID
+scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
+for i in `cat $hostfile`
+do
+    echo ${i} slots=4 >> `pwd`/hostfile-$SLURM_JOB_ID
+    ((num_node=${num_node}+1))
+done
+num_dcu=$((${num_node}*4))
+echo $num_dcu
+
+nodename=$(cat $hostfile |sed -n "1p")
+echo $nodename
+dist_url=`echo $nodename | awk '{print $1}'`
+export HSA_USERPTR_FOR_PAGED_MEM=0
+
+
+mpirun -np ${num_dcu} --hostfile hostfile-$SLURM_JOB_ID ${WORK_PATH}/2nodes_single_process.sh $dist_url
+
--- a/2node-run-squad/slurm-37908748.out
+++ b/2node-run-squad/slurm-37908748.out
+ HOME_PATH=/public/home/hepj
+ WORK_PATH=/public/home/hepj/torch/BERT/2node-run-squad
+ source /public/home/hepj/env22.10.sh
+/opt/gridview/slurm/spool_slurmd/job37908748/slurm_script: line 13: /public/home/hepj/env22.10.sh: No such file or directory
+ which python3
+/public/home/hepj/job_env/dtk22.10-torch-1.10-py3.7/bin/python3
+ hostfile=./37908748
+ scontrol show hostnames 'j20r4n[01-02]'
++ cat ./37908748
+ for i in '`cat $hostfile`'
+ echo j20r4n01 slots=4
++ pwd
+ (( num_node=+1 ))
+ for i in '`cat $hostfile`'
+ echo j20r4n02 slots=4
++ pwd
+ (( num_node=1+1 ))
+ num_dcu=8
+ echo 8
+8
++ cat ./37908748
++ sed -n 1p
+ nodename=j20r4n01
+ echo j20r4n01
+j20r4n01
++ echo j20r4n01
++ awk '{print $1}'
+ dist_url=j20r4n01
+ export HSA_USERPTR_FOR_PAGED_MEM=0
+ HSA_USERPTR_FOR_PAGED_MEM=0
+ mpirun -np 8 --hostfile hostfile-37908748 /public/home/hepj/torch/BERT/2node-run-squad/2nodes_single_process.sh j20r4n01
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 0 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 1 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 2 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 3 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 4 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 5 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 6 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 7 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+--------------------------------------------------------------------------
+Primary job  terminated normally, but 1 process returned
+a non-zero exit code. Per user-direction, the job has been aborted.
+--------------------------------------------------------------------------
+--------------------------------------------------------------------------
+mpirun detected that one or more processes exited with non-zero status, thus causing
+the job to be terminated. The first process to do so was:
+
+  Process name: [[60184,1],5]
+  Exit code:    2
+--------------------------------------------------------------------------
--- a/Dockerfile
+++ b/Dockerfile
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image for the final stage; override with --build-arg FROM_IMAGE_NAME=...
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
+# The Triton client SDK image is only used as a source for /workspace/install.
+FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk as trt
+FROM ${FROM_IMAGE_NAME}
+RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
+
+ENV BERT_PREP_WORKING_DIR /workspace/bert/data
+
+WORKDIR /workspace
+# wikiextractor is pinned to a fixed commit; bookcorpus tracks its default branch.
+RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
+RUN git clone https://github.com/soskek/bookcorpus.git
+
+# Copy the perf_client over
+COPY --from=trt /workspace/install/ /workspace/install/
+ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
+
+# Install trt python api
+# -y keeps the install non-interactive (a confirmation prompt aborts a docker
+# build); apt-get update in the same layer avoids using stale cached lists.
+RUN apt-get update && apt-get install -y libb64-0d
+RUN pip install /workspace/install/python/tensorrtserver*.whl
+
+WORKDIR /workspace/bert
+RUN pip install --upgrade --no-cache-dir pip \
+ && pip install --no-cache-dir \
+ tqdm boto3 requests six ipdb h5py html2text nltk progressbar onnxruntime \
+ git+https://github.com/NVIDIA/dllogger wget
+
+RUN apt-get update && apt-get install -y iputils-ping
+
+COPY . .
--- a/LICENSE
+++ b/LICENSE
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   Copyright 2019 NVIDIA CORPORATION. All rights reserved.
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
--- a/NOTICE
+++ b/NOTICE
+BERT PyTorch
+
+This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT
+licensed under the Apache License 2.0.
--- a/README.md
+++ b/README.md
+# **Bert算力测试**
+
+## 1.数据集准备
+
+pre_train 数据，目前最新的是wiki20220401的数据，但数据集压缩后近20GB，解压后300GB，下载速度慢，解压后占用大量空间。enwiki-20220401-pages-articles-multistream.xml.bz2下载链接如下：
+
+https://dumps.wikimedia.org/enwiki/20220401/ 
+
+这里使用服务器上已经下载并处理好的wiki数据集，预训练数据分为PHRASE1、PHRASE2
+
+```
+昆山wiki数据集地址PHRASE1:
+PATH_PHRASE1=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+
+昆山wiki数据集地址PHRASE2:
+PATH_PHRASE2=/public/software/apps/DeepLearning/Data/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+```
+
+```
+乌镇wiki地址PHRASE1:
+/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
+乌镇wiki地址PHRASE2:
+/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en
+```
+
+问答SQUAD1.1数据：
+
+[train-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+
+[dev-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+
+## 2.测试环境
+
+注意dtk、python、torch、apex等版本要对齐
+
+```
+1.创建python虚拟环境并进入
+virtualenv --python=~/package/Python-3.6.8/build/bin/python3 venv_dtk21.10.1_torch1.10
+source venv_dtk21.10.1_torch1.10/bin/activate
+
+2.安装依赖包
+pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+
+pip install torch-1.10.0a0+gitcc7c9c7-cp36-cp36m-linux_x86_64.whl
+pip install torchvision-0.10.0a0+300a8a4-cp36-cp36m-linux_x86_64.whl
+pip install apex-0.1-cp36-cp36m-linux_x86_64.whl
+
+3.环境变量设置
+module rm compiler/rocm/2.9 
+export ROCM_PATH=/public/home/hepj/job_env/apps/dtk-21.10.1
+export HIP_PATH=${ROCM_PATH}/hip
+export PATH=${ROCM_PATH}/bin:${ROCM_PATH}/llvm/bin:${ROCM_PATH}/hcc/bin:${ROCM_PATH}/hip/bin:$PATH
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3
+export MIOPEN_ENABLE_LOGGING_CMD=1
+export ROCBLAS_LAYER=3
+module unload compiler/rocm/2.9
+echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+```
+
+## 3.squad测试
+
+### 1.模型转化
+
+```
+python3 tf_to_torch/convert_tf_checkpoint.py --tf_checkpoint ~/NLP/cks/bs64k_32k_ckpt/model.ckpt-28252 --bert_config_path ~/NLP/cks/bs64k_32k_ckpt/bert_config.json --output_checkpoint model.ckpt-28252.pt
+```
+
+目前模型转换还存在问题，可能是由于下载的TF模型与model.ckpt-28252不同，或torch、apex版本兼容性问题导致，还在排查当中。可以直接使用转换好的模型进行squad任务的微调训练（PHRASE的测试不受此影响：PHRASE为预训练，只需要训练数据与网络结构，不需要加载模型）
+
+[转换好的模型  提取密码：vs8d](https://pan.baidu.com/share/init?surl=V8kFpgsLQe8tOAeft-5UpQ)
+
+### 2.参数说明
+
+```
+  --train_file  训练数据
+  --predict_file  预测文件
+  --init_checkpoint  模型文件
+  --vocab_file  词表文件
+  --output_dir  输出文件夹
+  --config_file  模型配置文件
+  --json-summary  输出json文件
+  --bert_model bert模型类型可选： bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-multilingual-cased, bert-base-chinese
+  --do_train 是否训练
+  --do_predict 是否预测
+  --train_batch_size  训练batch_size
+  --predict_batch_size 预测batch_size
+  --gpus_per_node  每个节点使用的gpu数
+  --local_rank 基于GPU的分布式训练的local_rank（单卡设置为-1）
+  --fp16 混合精度训练
+  --amp 混合精度训练
+```
+
+### 3.运行
+
+```
+#单卡
+./bert_squad.sh #单精度 （按自己路径对single_squad.sh里APP设置进行修改）
+./bert_squad_fp16.sh  #半精度 （按自己路径对single_squad_fp16.sh里APP设置进行修改）
+```
+
+```
+#多卡
+./bert_squad4.sh #单精度  （按自己路径对single_squad4.sh里APP设置进行修改）
+./bert_squad4_fp16.sh #半精度  （按自己路径对single_squad4_fp16.sh里APP设置进行修改）
+```
+
+```
+#多机多卡
+cd 2node-run-squad
+sbatch run_bert_squad_4dcu.sh （按照自己情况对#SBATCH -p、#SBATCH -J进行修改；需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中）
+```
+
+
+
+## 4.**PHRASE测试**
+
+### 1.参数说明
+
+```
+    --input_dir  输入数据文件夹
+    --output_dir 输出保存文件夹
+    --config_file 模型配置文件
+    --bert_model  bert模型类型可选： bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-multilingual-cased, bert-base-chinese
+    --train_batch_size 训练batch_size
+    --max_seq_length=128 最大长度（需要和训练数据相匹配）
+    --max_predictions_per_seq 输入序列中屏蔽标记的最大总数 
+    --max_steps 最大训练步数
+    --warmup_proportion 进行线性学习率热身的训练比例
+    --num_steps_per_checkpoint 多少步保存一次模型
+    --learning_rate 学习率
+    --seed 随机种子
+    --gradient_accumulation_steps 在执行向后/更新过程之前，累积（accumulate）的更新步骤数
+    --allreduce_post_accumulation 是否在梯度累积步骤期间执行allreduce
+    --do_train 是否训练
+    --fp16 混合精度训练
+    --amp 混合精度训练
+    --json-summary 输出json文件
+```
+
+### 2.PHRASE1
+
+```
+#单卡
+./bert_pre1.sh #单精度 （按自己路径对single_pre1_1.sh里APP设置进行修改）
+./bert_pre1_fp16.sh  #半精度 （按自己路径对single_pre1_1_fp16.sh里APP设置进行修改）
+#多卡
+./bert_pre1_4.sh #单精度 （按自己路径对single_pre1_4.sh里APP设置进行修改）
+./bert_pre1_4_fp16.sh   #半精度 （按自己路径对single_pre1_4_fp16.sh里APP设置进行修改）
+#多机多卡
+cd 2node-run-pre
+sbatch run_bert_pre1_4dcu.sh （按照自己情况对#SBATCH -p、#SBATCH -J进行修改；需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中）
+```
+
+### 3.PHRASE2
+
+```
+#单卡
+./bert_pre2.sh  #单精度 （按自己路径对single_pre2_1.sh里APP设置进行修改）
+./bert_pre2_fp16.sh  #半精度 （按自己路径对single_pre2_1_fp16.sh里APP设置进行修改）
+#多卡
+./bert_pre2_4.sh  #单精度 （按自己路径对single_pre2_4.sh里APP设置进行修改）
+./bert_pre2_4_fp16.sh  #半精度 （按自己路径对single_pre2_4_fp16.sh里APP设置进行修改）
+#多机多卡
+cd 2node-run-pre
+sbatch run_bert_pre2_4dcu.sh （按照自己情况对#SBATCH -p、#SBATCH -J进行修改；需要fp16可以在相应single文件APP中增加 --fp16 与 --amp参数,运行结果保存在相应的slurm文件中）
+```
+
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
--- a/README_old.md
+++ b/README_old.md
+# 简介
+
+使用PyTorch框架计算Bert网络。
+
+* BERT 的训练分为pre-train和fine-tune两种，pre-train训练分为两个phrase。
+
+* BERT 的推理可基于不同数据集进行精度验证
+* 数据生成、模型转换相关细节见  [README.md](http://10.0.100.3/dcutoolkit/deeplearing/dlexamples/-/blob/develop/PyTorch/NLP/BERT/scripts/README.md)
+
+# 运行示例
+
+目前提供基于wiki英文数据集 pre-train 两个阶段的训练和基于squad数据集fine-tune 训练的代码示例，
+
+## pre-train phrase1
+
+|参数名|解释|示例|
+|:---:|:---:|:---:|
+|PATH_PHRASE1|第一阶段训练数据集路径|/workspace/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.<br>15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10
+|OUTPUT_DIR|输出路径|/workspace/results
+|PATH_CONFIG|config路径|/workspace/bert_large_uncased/
+|PATH_PHRASE2|第二阶段训练数据集路径|/workspace/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.<br>15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10
+<br>
+
+### 单卡
+```
+export HIP_VISIBLE_DEVICES=0
+python3 run_pretraining_v1.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${OUTPUT_DIR}/checkpoints1 \
+    --config_file=${PATH_CONFIG}bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --do_train \
+    --json-summary dllogger.json
+```
+
+### 多卡
+
+* 方法一
+```
+export HIP_VISIBLE_DEVICES=0,1,2,3
+python3 run_pretraining_v1.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${OUTPUT_DIR}/checkpoints \
+    --config_file=${PATH_CONFIG}bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --do_train \
+    --json-summary dllogger.json
+```
+* 方法二
+
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+
+```
+#scripts/run_pretrain.sh 脚本默认每个节点四块卡
+cd scripts; bash run_pretrain.sh
+```
+
+
+## pre-train phrase2
+
+### 单卡
+```
+export HIP_VISIBLE_DEVICES=0
+python3 run_pretraining_v1.py \
+   --input_dir=${PATH_PHRASE2} \
+   --output_dir=${OUTPUT_DIR}/checkpoints2 \
+   --config_file=${PATH_CONFIG}bert_config.json \
+   --bert_model=bert-large-uncased \
+   --train_batch_size=4 \
+   --max_seq_length=512 \
+   --max_predictions_per_seq=80 \
+   --max_steps=400000 \
+   --warmup_proportion=0.128 \
+   --num_steps_per_checkpoint=200000 \
+   --learning_rate=4e-3 \
+   --seed=12439 \
+   --gradient_accumulation_steps=1 \
+   --allreduce_post_accumulation \
+   --do_train \
+   --phase2 \
+   --phase1_end_step=0 \
+   --json-summary dllogger.json
+```
+
+### 多卡
+
+* 方法一
+```
+export HIP_VISIBLE_DEVICES=0,1,2,3
+python3 run_pretraining_v1.py \
+   --input_dir=${PATH_PHRASE2} \
+   --output_dir=${OUTPUT_DIR}/checkpoints2 \
+   --config_file=${PATH_CONFIG}bert_config.json \
+   --bert_model=bert-large-uncased \
+   --train_batch_size=4 \
+   --max_seq_length=512 \
+   --max_predictions_per_seq=80 \
+   --max_steps=400000 \
+   --warmup_proportion=0.128 \
+   --num_steps_per_checkpoint=200000 \
+   --learning_rate=4e-3 \
+   --seed=12439 \
+   --gradient_accumulation_steps=1 \
+   --allreduce_post_accumulation \
+   --do_train \
+   --phase2 \
+   --phase1_end_step=0 \
+   --json-summary dllogger.json
+```
+* 方法二
+
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+
+```
+#scripts/run_pretrain2.sh 脚本默认每个节点四块卡
+cd scripts; bash run_pretrain2.sh
+```
+
+
+
+## fine-tune 训练
+
+### 单卡
+```
+python3 run_squad_v1.py \
+  --train_file squad/v1.1/train-v1.1.json \
+  --init_checkpoint model.ckpt-28252.pt \
+  --vocab_file vocab.txt \
+  --output_dir SQuAD \
+  --config_file bert_config.json \
+  --bert_model=bert-large-uncased \
+  --do_train \
+  --train_batch_size 1 \
+  --gpus_per_node 1 
+```
+### 多卡
+
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+
+```
+#scripts/run_squad_1.sh 脚本默认每个节点四块卡
+bash run_squad_1.sh
+```
+
+
+
+# 参考资料
+[https://github.com/mlperf/training_results_v0.7/blob/master/NVIDIA/benchmarks/bert/implementations/pytorch](https://github.com/mlperf/training_results_v0.7/blob/master/NVIDIA/benchmarks/bert/implementations/pytorch)
+[https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT)
--- a/bert_config.json
+++ b/bert_config.json
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
--- a/bert_per1_4_fp16.sh
+++ b/bert_per1_4_fp16.sh
+export HIP_LAUNCH_BLOCKING=1
+mpirun --allow-run-as-root -np 4  single_pre1_4_fp16.sh
+
+
--- a/bert_pre1.sh
+++ b/bert_pre1.sh
+#!/bin/bash
+mpirun --allow-run-as-root -np 1  single_pre1_1.sh