Commit dbb44794 authored by hepj987

Add run scripts

parent 6652f879
@@ -37,6 +37,8 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-d
## Install dependencies
The installation may replace the DCU build of TensorFlow; the matching DCU package can be downloaded from the [developer community](https://cancon.hpccube.com:65024/4/main/tensorflow/dtk22.10).
```
pip install -r requirements.txt
```
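If installing the dependencies pulls in a stock TensorFlow and replaces the DCU build, the DCU wheel can be reinstalled afterwards. A minimal sketch; the `.whl` filename below is a placeholder for the actual package downloaded from the developer community:
```
# Restore the DCU build of TensorFlow after installing the dependencies.
# The wheel filename is a placeholder; use the file downloaded for dtk22.10.
pip uninstall -y tensorflow
pip install ./tensorflow-2.7.0-dcu.whl
# Verify which TensorFlow is now active
python3 -c "import tensorflow as tf; print(tf.__version__)"
```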
@@ -50,10 +52,10 @@ The TF2.0 version requires the input data to be converted to tf_record format
```
python create_finetuning_data.py \
  --input_data_dir=/public/home/hepj/data/MNLI \
  --vocab_file=/public/home/hepj/model_source/uncased_L-12_H-768_A-12/vocab.txt \
  --train_data_output_path=/public/home/hepj/MNLI/train.tf_record \
  --eval_data_output_path=/public/home/hepj/MNLI/eval.tf_record \
  --meta_data_file_path=/public/home/hepj/MNLI/meta_data \
  --fine_tuning_task_type=classification \
  --max_seq_length=32 \
  --classification_task_name=MNLI
```
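To sanity-check the conversion, the records in the generated file can be counted. A minimal sketch, assuming the output paths used above:
```
# Count the examples written to the training tf_record file
python3 -c "
import tensorflow as tf
n = sum(1 for _ in tf.data.TFRecordDataset('/public/home/hepj/MNLI/train.tf_record'))
print('train records:', n)
"
```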
@@ -76,13 +78,16 @@ TF2.7.2 and TF1.15.0 store and load models in different formats, and the BERT checkpoints provided on the official site are generally
```
python3 tf2_encoder_checkpoint_converter.py \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --checkpoint_to_convert /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
  --converted_checkpoint_path /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

# After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001
# and bert_model.ckpt-1.index to bert_model.ckpt.index
```
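The rename described above can be done with two `mv` commands; a minimal sketch, assuming the converted checkpoint was written to the output directory from the command above:
```
cd /public/home/hepj/model_source/bert-base-TF2
mv bert_model.ckpt-1.data-00000-of-00001 bert_model.ckpt.data-00000-of-00001
mv bert_model.ckpt-1.index bert_model.ckpt.index
```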
## Single-card run
@@ -109,12 +114,12 @@ sh bert_class.sh
## Multi-card run
```
sh bert_class_gpus.sh
```
# SQuAD1.1 question answering test
## Data conversion
The TF2.0 version requires the input data to be converted to tf_record format
@@ -123,7 +128,7 @@ python3 create_finetuning_data.py \
```
python3 create_finetuning_data.py \
  --squad_data_file=/public/home/hepj/model/model_source/sq1.1/train-v1.1.json \
  --vocab_file=/public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train_new.tf_record \
  --meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/eval_new.tf_record \
  --fine_tuning_task_type=squad \
  --do_lower_case=False \
@@ -139,21 +144,24 @@
  --max_seq_length    maximum sequence length
```
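The generated meta_data can be inspected to confirm the conversion settings; a minimal sketch, assuming meta_data is written as JSON (as in the official BERT fine-tuning scripts):
```
# Print the metadata produced by create_finetuning_data.py
python3 -c "
import json
with open('/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data') as f:
    print(json.load(f))
"
```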
## Model conversion
```
python3 tf2_encoder_checkpoint_converter.py \
  --bert_config_file /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
  --checkpoint_to_convert /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
  --converted_checkpoint_path /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

# After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001
# and bert_model.ckpt-1.index to bert_model.ckpt.index
```
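The same rename is needed here; a compact sketch of the step, assuming the output directory from the command above:
```
cd /public/home/hepj/model_source/bert-large-TF2
for f in bert_model.ckpt-1.*; do
  mv "$f" "bert_model.ckpt.${f#bert_model.ckpt-1.}"
done
```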
## Single-card run
```
sh bert_squad.sh
@@ -165,7 +173,7 @@ sh bert_squad.sh
--eval_data_path       path to the evaluation data
--bert_config_file     BERT model config file
--init_checkpoint      initial checkpoint path
--train_batch_size     training batch size
--predict_file         prediction file path
--eval_batch_size      evaluation batch size
--steps_per_loop       interval (in steps) between log output
@@ -176,20 +184,20 @@ sh bert_squad.sh
--num_gpus             number of GPUs to use
```
## Multi-card run
```
sh bert_squad_gpus.sh
```
## Model accuracy
To be completed...

# Source repository and issue feedback
https://developer.hpccube.com/codes/modelzoo/bert-tf2

# References
https://github.com/tensorflow/models/tree/v2.3.0/official/nlp
bert_class.sh (single-card MNLI run):

export HIP_VISIBLE_DEVICES=0
python3 run_classifier.py \
  --mode train_and_eval \
  --input_meta_data_path /public/home/hepj/MNLI/meta_data \
  --train_data_path /public/home/hepj/MNLI/train.tf_record \
  --eval_data_path /public/home/hepj/MNLI/eval.tf_record \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
  --train_batch_size 320 \
  --eval_batch_size 32 \
  --steps_per_loop 1000 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --num_gpus 1 \
  --model_dir /public/home/hepj/model/tf2/out1 \
  --distribution_strategy mirrored
bert_class_gpus.sh (four-card MNLI run):

export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_classifier.py \
  --mode train_and_eval \
  --input_meta_data_path /public/home/hepj/MNLI/meta_data \
  --train_data_path /public/home/hepj/MNLI/train.tf_record \
  --eval_data_path /public/home/hepj/MNLI/eval.tf_record \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
  --train_batch_size 320 \
  --eval_batch_size 32 \
  --steps_per_loop 1000 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --num_gpus 4 \
  --model_dir /public/home/hepj/model/tf2/out1 \
  --distribution_strategy mirrored
bert_squad.sh (single-card SQuAD1.1 run):

export HIP_VISIBLE_DEVICES=0
python3 run_squad_xuan.py \
  --mode train_and_eval \
  --vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
  --input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
  --predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
  --init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
  --train_batch_size 4 \
  --predict_batch_size 4 \
  --learning_rate 2e-5 \
  --log_steps 1 \
  --num_gpus 1 \
  --distribution_strategy mirrored \
  --model_dir /public/home/hepj/model/tf2/squad1
bert_squad_gpus.sh (four-card SQuAD1.1 run):

export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_squad_xuan.py \
  --mode train_and_eval \
  --vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
  --input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
  --predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
  --init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
  --train_batch_size 4 \
  --predict_batch_size 4 \
  --learning_rate 2e-5 \
  --log_steps 1 \
  --num_gpus 4 \
  --distribution_strategy mirrored \
  --model_dir /public/home/hepj/model/tf2/squad1
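Before launching these scripts, it can be useful to confirm that the devices selected via HIP_VISIBLE_DEVICES are visible to TensorFlow; a minimal sketch, assuming the DCU build reports accelerators as GPU devices:
```
export HIP_VISIBLE_DEVICES=0,1,2,3
# List the accelerators TensorFlow can see
python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
```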