bert-large training

230156c4 · yangzhong · 230156c4 · 230156c4 · 230156c4 · 230156c4
Commit 230156c4 authored Oct 21, 2025 by yangzhong
20 changed files
--- a/2node-run-pre/2nodes_single_process_pre1.sh
+++ b/2node-run-pre/2nodes_single_process_pre1.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+export HIP_LAUNCH_BLOCKING=1
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export PATH_PHRASE1=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase1 \
+    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --gpus_per_node ${2} \
+    --do_train \
+    --local_rank ${comm_rank} \
+    --world_size ${comm_size} \
+    --dist_url tcp://${1}:34567 \
+    --json-summary ${HOME}/outdir/torch/pre_wiki/phrase1/dllogger.json
+ "
+# Bind this rank to one GPU and one NUMA node, chosen from its node-local rank:
+# local ranks 0-3 run on NUMA node 0, local ranks 4-7 on NUMA node 3, and the
+# visible GPU index equals the local rank.
+case "${lrank}" in
+[0-3]) numa_node=0 ;;
+[4-7]) numa_node=3 ;;
+*)
+  # An unexpected (or empty) local rank used to match no arm, so the script
+  # exited successfully without ever starting the trainer.
+  echo "$0: unsupported local rank '${lrank}' (expected 0-7)" >&2
+  exit 1
+  ;;
+esac
+export HIP_VISIBLE_DEVICES=${lrank}
+# ${APP} is intentionally unquoted: it holds the whole command line and must be
+# word-split into separate arguments.
+echo numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
+numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
--- a/2node-run-pre/2nodes_single_process_pre2.sh
+++ b/2node-run-pre/2nodes_single_process_pre2.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+export HIP_LAUNCH_BLOCKING=1
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export PATH_PHRASE2=/public/DL_DATA/wikicorpus_en/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10/wikicorpus_en/training
+APP="python3 ${HOME}/torch/bert-pretrain/run_pretraining_v4.py  \
+    --input_dir=${PATH_PHRASE2}    \
+    --output_dir=${HOME}/outdir/torch/pre_wiki/phrase2 \
+    --config_file=${HOME}/model/uncased_L-24_H-1024_A-16/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=2 \
+    --max_seq_length=512 \
+    --max_predictions_per_seq=80 \
+    --max_steps=400000 \
+    --warmup_proportion=0.128 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-3 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --gpus_per_node ${2} \
+    --do_train \
+    --phase2 \
+    --phase1_end_step=0 \
+    --local_rank ${comm_rank} \
+    --world_size ${comm_size} \
+    --dist_url tcp://${1}:34567 \
+    --json-summary ${HOME}/outdir/torch/pre_wiki4/phrase2/dllogger.json
+ "
+# Bind this rank to one GPU and one NUMA node, chosen from its node-local rank:
+# local ranks 0-3 run on NUMA node 0, local ranks 4-7 on NUMA node 3, and the
+# visible GPU index equals the local rank.
+case "${lrank}" in
+[0-3]) numa_node=0 ;;
+[4-7]) numa_node=3 ;;
+*)
+  # An unexpected (or empty) local rank used to match no arm, so the script
+  # exited successfully without ever starting the trainer.
+  echo "$0: unsupported local rank '${lrank}' (expected 0-7)" >&2
+  exit 1
+  ;;
+esac
+export HIP_VISIBLE_DEVICES=${lrank}
+# ${APP} is intentionally unquoted: it holds the whole command line and must be
+# word-split into separate arguments.
+echo numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
+numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
--- a/2node-run-pre/env.sh
+++ b/2node-run-pre/env.sh
+# Load the environment script located under ROCM_PATH. The expansion is quoted
+# so that a ROCM_PATH containing whitespace still resolves to a single path.
+source "${ROCM_PATH}/env.sh"
\ No newline at end of file
--- a/2node-run-pre/hostfile
+++ b/2node-run-pre/hostfile
+j20r4n01 slots=8
+j20r4n02 slots=8
--- a/2node-run-pre/run_bert_pre1_4dcus.sh
+++ b/2node-run-pre/run_bert_pre1_4dcus.sh
+#!/usr/bin/env bash
+HOME_PATH=/work/home/hepj
+WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
+which python3
+source env.sh
+hostfile=./hostfile
+node=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($node*8))
+nodename=$(cat $hostfile |sed -n "1p")
+dist_url=`echo $nodename | awk '{print $1}'`
+mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID  --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre1.sh $dist_url $node
--- a/2node-run-pre/run_bert_pre2_4dcus.sh
+++ b/2node-run-pre/run_bert_pre2_4dcus.sh
+#!/usr/bin/env bash
+HOME_PATH=/work/home/hepj
+WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-pre
+which python3
+source env.sh
+hostfile=./hostfile
+node=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($node*8))
+nodename=$(cat $hostfile |sed -n "1p")
+dist_url=`echo $nodename | awk '{print $1}'`
+mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID  --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process_pre2.sh $dist_url $node
--- a/2node-run-squad/2nodes_single_process.sh
+++ b/2node-run-squad/2nodes_single_process.sh
+#!/bin/bash
+export MIOPEN_DEBUG_DISABLE_FIND_DB=1
+export HSA_USERPTR_FOR_PAGED_MEM=0
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+APP="python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py \
+  --train_file  ${HOME}/data/sq1.1/train-v1.1.json \
+  --predict_file  ${HOME}/data/sq1.1/dev-v1.1.json \
+  --init_checkpoint  ${HOME}/model/pytorch_bert/model.ckpt-28252.pt \
+  --vocab_file  ${HOME}/model/pytorch_bert/vocab.txt \
+  --output_dir  ${HOME}/outdir/torch/SQUAD4 \
+  --config_file  ${HOME}/model/pytorch_bert/bert_config.json \
+  --json-summary  ${HOME}/outdir/torch/SQUAD4/results.json \
+  --bert_model bert-large-uncased \
+  --do_train \
+  --do_predict \
+  --train_batch_size  4 \
+  --predict_batch_size 4 \
+  --gpus_per_node  2 \
+  --local_rank ${comm_rank} \
+  --world_size ${comm_size} \
+  --use_env  \
+  --dist_url tcp://${1}:34567 \
+ "
+# Bind this rank to one GPU and one NUMA node, chosen from its node-local rank:
+# local ranks 0-3 run on NUMA node 0, local ranks 4-7 on NUMA node 3, and the
+# visible GPU index equals the local rank.
+case "${lrank}" in
+[0-3]) numa_node=0 ;;
+[4-7]) numa_node=3 ;;
+*)
+  # An unexpected (or empty) local rank used to match no arm, so the script
+  # exited successfully without ever starting the trainer.
+  echo "$0: unsupported local rank '${lrank}' (expected 0-7)" >&2
+  exit 1
+  ;;
+esac
+export HIP_VISIBLE_DEVICES=${lrank}
+# ${APP} is intentionally unquoted: it holds the whole command line and must be
+# word-split into separate arguments.
+echo numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
+numactl --cpunodebind=${numa_node} --membind=${numa_node} ${APP}
--- a/2node-run-squad/env.sh
+++ b/2node-run-squad/env.sh
+# Load the environment script located under ROCM_PATH. The expansion is quoted
+# so that a ROCM_PATH containing whitespace still resolves to a single path.
+source "${ROCM_PATH}/env.sh"
\ No newline at end of file
--- a/2node-run-squad/hostfile
+++ b/2node-run-squad/hostfile
+j20r4n01 slots=8
+j20r4n02 slots=8
--- a/2node-run-squad/run_bert_squad_4dcus.sh
+++ b/2node-run-squad/run_bert_squad_4dcus.sh
+#!/usr/bin/env bash
+HOME_PATH=/public/home/hepj
+WORK_PATH=${HOME_PATH}/bert-pytorch/2node-run-squad
+which python3
+source env.sh
+hostfile=./hostfile
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*8))
+nodename=$(cat $hostfile |sed -n "1p")
+dist_url=`echo $nodename | awk '{print $1}'`
+mpirun -np ${np} --hostfile hostfile-$SLURM_JOB_ID  --bind-to none --mca btl_tcp_if_include $dist_url ${WORK_PATH}/2nodes_single_process.sh $dist_url
--- a/2node-run-squad/slurm-37908748.out
+++ b/2node-run-squad/slurm-37908748.out
+ HOME_PATH=/public/home/hepj
+ WORK_PATH=/public/home/hepj/torch/BERT/2node-run-squad
+ source /public/home/hepj/env22.10.sh
+/opt/gridview/slurm/spool_slurmd/job37908748/slurm_script: line 13: /public/home/hepj/env22.10.sh: No such file or directory
+ which python3
+/public/home/hepj/job_env/dtk22.10-torch-1.10-py3.7/bin/python3
+ hostfile=./37908748
+ scontrol show hostnames 'j20r4n[01-02]'
++ cat ./37908748
+ for i in '`cat $hostfile`'
+ echo j20r4n01 slots=4
++ pwd
+ (( num_node=+1 ))
+ for i in '`cat $hostfile`'
+ echo j20r4n02 slots=4
++ pwd
+ (( num_node=1+1 ))
+ num_dcu=8
+ echo 8
+8
++ cat ./37908748
++ sed -n 1p
+ nodename=j20r4n01
+ echo j20r4n01
+j20r4n01
++ echo j20r4n01
++ awk '{print $1}'
+ dist_url=j20r4n01
+ export HSA_USERPTR_FOR_PAGED_MEM=0
+ HSA_USERPTR_FOR_PAGED_MEM=0
+ mpirun -np 8 --hostfile hostfile-37908748 /public/home/hepj/torch/BERT/2node-run-squad/2nodes_single_process.sh j20r4n01
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 0 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 1 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 2 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 3 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=0 --membind=0 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 4 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=1 --membind=1 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 5 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=2 --membind=2 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 6 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+NCCL_SOCKET_IFNAME=eno1 numactl --cpunodebind=3 --membind=3 python3 /work/home/hepj/torch/bert-squad/run_squad_v4.py --train_file /public/home/hepj/data/sq1.1/train-v1.1.json --predict_file /public/home/hepj/data/sq1.1/dev-v1.1.json --init_checkpoint /public/home/hepj/model/pytorch_bert/model.ckpt-28252.pt --vocab_file /public/home/hepj/model/pytorch_bert/vocab.txt --output_dir /public/home/hepj/outdir/torch/SQUAD4 --config_file /public/home/hepj/model/pytorch_bert/bert_config.json --json-summary /public/home/hepj/outdir/torch/SQUAD4/results.json --bert_model bert-large-uncased --do_train --do_predict --train_batch_size 4 --predict_batch_size 4 --gpus_per_node 2 --local_rank 7 --world_size 8 --use_env --dist_url tcp://j20r4n01:34567
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+(null): can't open file '/work/home/hepj/torch/bert-squad/run_squad_v4.py': [Errno 2] No such file or directory
+--------------------------------------------------------------------------
+Primary job  terminated normally, but 1 process returned
+a non-zero exit code. Per user-direction, the job has been aborted.
+--------------------------------------------------------------------------
+--------------------------------------------------------------------------
+mpirun detected that one or more processes exited with non-zero status, thus causing
+the job to be terminated. The first process to do so was:
+  Process name: [[60184,1],5]
+  Exit code:    2
+--------------------------------------------------------------------------
--- a/Dockerfile
+++ b/Dockerfile
+# Image for BERT training on top of the DTK PyTorch base image.
+# NOTE(review): README.md pulls a different base tag
+# (2.4.1-ubuntu22.04-dtk25.04.1-py3.10) - confirm which one is intended.
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.1-py3.10
+COPY requirements.txt requirements.txt
+RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
+ENV LANG=C.UTF-8
+# The DTK environment is loaded in the same RUN step that needs it: variables
+# set by a standalone "RUN source ..." do not survive into later layers, and
+# "source" is not available in the default /bin/sh used by RUN.
+RUN bash -c "source /opt/dtk-24.04.1/env.sh && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/"
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   Copyright 2019 NVIDIA CORPORATION. All rights reserved.
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
--- a/NOTICE
+++ b/NOTICE
+BERT PyTorch
+This repository includes software from https://github.com/huggingface/pytorch-pretrained-BERT
+licensed under the Apache License 2.0.
--- a/README.md
+++ b/README.md
+# bert-large 训练
+## 论文
+`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`
+[BERT论文pdf地址](https://arxiv.org/pdf/1810.04805.pdf)
+## 环境配置
+### Docker
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04.1-py3.10
+```
+其它依赖库参照requirements.txt安装：
+```
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/  
+```
+## 数据集
+pre_train 数据：本项目使用的是wiki20220401的数据。该数据集压缩后近20GB、解压后300GB，下载速度慢，解压后占用大量空间。由于wiki数据集经常更新，官网并不保留旧版数据集，这里提供处理好的seq128和seq512的数据集网盘下载链接。
+（seq128对应PHRASE1）链接：https://pan.baidu.com/s/13GA-Jmfr2qXrChjiM2UfFQ?pwd=l30u  提取码：l30u
+（seq512对应PHRASE2）链接：https://pan.baidu.com/s/1MBFjYNsGQzlnc8aEb7Pg4w?pwd=6ap2  提取码：6ap2 
+**这里使用服务器上已经下载并处理好的wiki数据集，预训练数据分为PHRASE1、PHRASE2**
+`wiki数据集结构`
+```
+ ──wikicorpus_en_128 
+    │   ├── training
+    │             ├── wikicorpus_en_training_0.tfrecord.hdf5
+    │             ├── wikicorpus_en_training_1000.tfrecord.hdf5
+    │             └── ...
+    │   └── test
+    │             ├── wikicorpus_en_test_99.tfrecord.hdf5
+    │             ├── wikicorpus_en_test_9.tfrecord.hdf5
+    │             └── ...
+──wikicorpus_en_512 
+    │   ├── training
+    │             ├── wikicorpus_en_training_0.tfrecord.hdf5
+    │             ├── wikicorpus_en_training_1000.tfrecord.hdf5
+    │             └── ...
+    │   └── test
+    │             ├── wikicorpus_en_test_99.tfrecord.hdf5
+    │             ├── wikicorpus_en_test_9.tfrecord.hdf5
+    │             └── ...
+```
+问答SQUAD1.1数据：
+[train-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+[dev-v1.1](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+`squadv1.1数据结构`
+```
+├── dev-v1.1.json
+└── train-v1.1.json
+```
+## 训练
+#### 1.参数说明
+```
+    --input_dir  输入数据文件夹
+    --output_dir 输出保存文件夹
+    --config_file 模型配置文件
+    --bert_model  bert模型类型可选： bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,bert-base-multilingual-cased, bert-base-chinese
+    --train_batch_size 训练batch_size
+    --max_seq_length 最大序列长度（需要和训练数据相匹配，例如128）
+    --max_predictions_per_seq 输入序列中屏蔽标记的最大总数 
+    --max_steps 最大训练步数
+    --warmup_proportion 进行线性学习率热身的训练比例
+    --num_steps_per_checkpoint 多少步保存一次模型
+    --learning_rate 学习率
+    --seed 随机种子
+    --gradient_accumulation_steps 在执行反向传播/参数更新之前累积（accumulate）的更新步数
+    --allreduce_post_accumulation 是否在梯度累积步骤期间执行 all-reduce（梯度规约）
+    --do_train 是否训练
+    --fp16 混合精度训练
+    --amp 混合精度训练
+    --json-summary 输出json文件
+```
+#### 2.PHRASE1
+```
+#多卡
+bash bert_pre1_4.sh        #单精度 （按自己路径对single_pre1_4.sh里APP设置进行修改）
+bash bert_pre1_4_fp16.sh   #半精度 （按自己路径对single_pre1_4_fp16.sh里APP设置进行修改）
+```
+#### 3.PHRASE2
+```
+#多卡
+bash bert_pre2_4.sh       #单精度 （按自己路径对single_pre2_4.sh里APP设置进行修改）
+bash bert_pre2_4_fp16.sh  #半精度 （按自己路径对single_pre2_4_fp16.sh里APP设置进行修改）
+```
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
--- a/README_old.md
+++ b/README_old.md
+# 简介
+使用PyTorch框架计算Bert网络。
+* BERT 的训练分为pre-train和fine-tune两种，pre-train训练分为两个阶段（phrase1、phrase2）。
+* BERT 的推理可基于不同数据集进行精度验证
+* 数据生成、模型转换相关细节见  [README.md](http://10.0.100.3/dcutoolkit/deeplearing/dlexamples/-/blob/develop/PyTorch/NLP/BERT/scripts/README.md)
+# 运行示例
+目前提供基于wiki英文数据集 pre-train 两个阶段的训练和基于squad数据集 fine-tune 训练的代码示例。
+## pre-train phrase1
+|参数名|解释|示例|
+|:---:|:---:|:---:|
+|PATH_PHRASE1|第一阶段训练数据集路径|/workspace/lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.<br>15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10
+|OUTPUT_DIR|输出路径|/workspace/results
+|PATH_CONFIG|config路径|/workspace/bert_large_uncased
+|PATH_PHRASE2|第二阶段训练数据集路径|/workspace/lower_case_1_seq_len_512_max_pred_80_masked_lm_prob_0.<br>15_random_seed_12345_dupe_factor_5_shard_1472_test_split_10
+<br>
+### 单卡
+```
+export HIP_VISIBLE_DEVICES=0
+python3 run_pretraining_v1.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${OUTPUT_DIR}/checkpoints1 \
+    --config_file=${PATH_CONFIG}/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --do_train \
+    --json-summary dllogger.json
+```
+### 多卡
+* 方法一
+```
+export HIP_VISIBLE_DEVICES=0,1,2,3
+python3 run_pretraining_v1.py  \
+    --input_dir=${PATH_PHRASE1}    \
+    --output_dir=${OUTPUT_DIR}/checkpoints \
+    --config_file=${PATH_CONFIG}/bert_config.json \
+    --bert_model=bert-large-uncased \
+    --train_batch_size=16 \
+    --max_seq_length=128 \
+    --max_predictions_per_seq=20 \
+    --max_steps=100000 \
+    --warmup_proportion=0.0 \
+    --num_steps_per_checkpoint=20000 \
+    --learning_rate=4.0e-4 \
+    --seed=12439 \
+    --gradient_accumulation_steps=1 \
+    --allreduce_post_accumulation \
+    --do_train \
+    --json-summary dllogger.json
+```
+* 方法二
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+```
+#scripts/run_pretrain.sh 脚本默认每个节点四块卡
+cd scripts; bash run_pretrain.sh
+```
+## pre-train phrase2
+### 单卡
+```
+export HIP_VISIBLE_DEVICES=0
+python3 run_pretraining_v1.py \
+   --input_dir=${PATH_PHRASE2} \
+   --output_dir=${OUTPUT_DIR}/checkpoints2 \
+   --config_file=${PATH_CONFIG}/bert_config.json \
+   --bert_model=bert-large-uncased \
+   --train_batch_size=4 \
+   --max_seq_length=512 \
+   --max_predictions_per_seq=80 \
+   --max_steps=400000 \
+   --warmup_proportion=0.128 \
+   --num_steps_per_checkpoint=200000 \
+   --learning_rate=4e-3 \
+   --seed=12439 \
+   --gradient_accumulation_steps=1 \
+   --allreduce_post_accumulation \
+   --do_train \
+   --phase2 \
+   --phase1_end_step=0 \
+   --json-summary dllogger.json
+```
+### 多卡
+* 方法一
+```
+export HIP_VISIBLE_DEVICES=0,1,2,3
+python3 run_pretraining_v1.py \
+   --input_dir=${PATH_PHRASE2} \
+   --output_dir=${OUTPUT_DIR}/checkpoints2 \
+   --config_file=${PATH_CONFIG}/bert_config.json \
+   --bert_model=bert-large-uncased \
+   --train_batch_size=4 \
+   --max_seq_length=512 \
+   --max_predictions_per_seq=80 \
+   --max_steps=400000 \
+   --warmup_proportion=0.128 \
+   --num_steps_per_checkpoint=200000 \
+   --learning_rate=4e-3 \
+   --seed=12439 \
+   --gradient_accumulation_steps=1 \
+   --allreduce_post_accumulation \
+   --do_train \
+   --phase2 \
+   --phase1_end_step=0 \
+   --json-summary dllogger.json
+```
+* 方法二
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+```
+#scripts/run_pretrain2.sh 脚本默认每个节点四块卡
+cd scripts; bash run_pretrain2.sh
+```
+## fine-tune 训练
+### 单卡
+```
+python3 run_squad_v1.py \
+  --train_file squad/v1.1/train-v1.1.json \
+  --init_checkpoint model.ckpt-28252.pt \
+  --vocab_file vocab.txt \
+  --output_dir SQuAD \
+  --config_file bert_config.json \
+  --bert_model=bert-large-uncased \
+  --do_train \
+  --train_batch_size 1 \
+  --gpus_per_node 1 
+```
+### 多卡
+hostfile:
+```
+node1 slots=4
+node2 slots=4
+```
+```
+#scripts/run_squad_1.sh 脚本默认每个节点四块卡
+bash run_squad_1.sh
+```
+# 参考资料
+[https://github.com/mlperf/training_results_v0.7/blob/master/NVIDIA/benchmarks/bert/implementations/pytorch](https://github.com/mlperf/training_results_v0.7/blob/master/NVIDIA/benchmarks/bert/implementations/pytorch)
+[https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT](https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT)
--- a/__pycache__/file_utils.cpython-310.pyc
+++ b/__pycache__/file_utils.cpython-310.pyc
--- a/__pycache__/modeling.cpython-310.pyc
+++ b/__pycache__/modeling.cpython-310.pyc
--- a/__pycache__/schedulers.cpython-310.pyc
+++ b/__pycache__/schedulers.cpython-310.pyc