Commit dbb44794 authored by hepj987

Add run scripts

parent 6652f879
@@ -37,6 +37,8 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-d
## Install dependencies
The installation may replace the DCU build of TensorFlow; the matching DCU package can be downloaded from the [developer community](https://cancon.hpccube.com:65024/4/main/tensorflow/dtk22.10).
```
pip install -r requirements.txt
```
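If installing the dependencies pulls in a stock TensorFlow and replaces the DCU build, the DCU wheel can be reinstalled afterwards. A minimal sketch; the `.whl` filename below is a placeholder for the actual package downloaded from the developer community:
```
# Restore the DCU build of TensorFlow after installing the dependencies.
# The wheel filename is a placeholder; use the file downloaded for dtk22.10.
pip uninstall -y tensorflow
pip install ./tensorflow-2.7.0-dcu.whl
# Verify which TensorFlow is now active
python3 -c "import tensorflow as tf; print(tf.__version__)"
```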
@@ -50,10 +52,10 @@ The TF2.0 version requires the input data to be converted to tf_record format
```
python create_finetuning_data.py \
  --input_data_dir=/public/home/hepj/data/MNLI \
  --vocab_file=/public/home/hepj/model_source/uncased_L-12_H-768_A-12/vocab.txt \
  --train_data_output_path=/public/home/hepj/MNLI/train.tf_record \
  --eval_data_output_path=/public/home/hepj/MNLI/eval.tf_record \
  --meta_data_file_path=/public/home/hepj/MNLI/meta_data \
  --fine_tuning_task_type=classification \
  --max_seq_length=32 \
  --classification_task_name=MNLI
```
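To sanity-check the conversion, the records in the generated file can be counted. A minimal sketch, assuming the output paths used above:
```
# Count the examples written to the training tf_record file
python3 -c "
import tensorflow as tf
n = sum(1 for _ in tf.data.TFRecordDataset('/public/home/hepj/MNLI/train.tf_record'))
print('train records:', n)
"
```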
@@ -76,13 +78,16 @@ TF2.7.2 and TF1.15.0 store and load models in different formats, and the BERT checkpoints provided on the official site are generally
```
python3 tf2_encoder_checkpoint_converter.py \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --checkpoint_to_convert /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
  --converted_checkpoint_path /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

# After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001
# and bert_model.ckpt-1.index to bert_model.ckpt.index
```
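The rename described above can be done with two `mv` commands; a minimal sketch, assuming the converted checkpoint was written to the output directory from the command above:
```
cd /public/home/hepj/model_source/bert-base-TF2
mv bert_model.ckpt-1.data-00000-of-00001 bert_model.ckpt.data-00000-of-00001
mv bert_model.ckpt-1.index bert_model.ckpt.index
```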
## Single-card run
@@ -109,12 +114,12 @@ sh bert_class.sh
## Multi-card run
```
sh bert_class_gpus.sh
```
# SQuAD1.1 question answering test
## Data conversion
The TF2.0 version requires the input data to be converted to tf_record format
@@ -123,7 +128,7 @@ python3 create_finetuning_data.py \
```
python3 create_finetuning_data.py \
  --squad_data_file=/public/home/hepj/model/model_source/sq1.1/train-v1.1.json \
  --vocab_file=/public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train_new.tf_record \
  --meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/eval_new.tf_record \
  --fine_tuning_task_type=squad \
  --do_lower_case=False \
@@ -139,21 +144,24 @@
  --max_seq_length    maximum sequence length
```
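The generated meta_data can be inspected to confirm the conversion settings; a minimal sketch, assuming meta_data is written as JSON (as in the official BERT fine-tuning scripts):
```
# Print the metadata produced by create_finetuning_data.py
python3 -c "
import json
with open('/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data') as f:
    print(json.load(f))
"
```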
## Model conversion
```
python3 tf2_encoder_checkpoint_converter.py \
  --bert_config_file /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
  --checkpoint_to_convert /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
  --converted_checkpoint_path /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

# After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001
# and bert_model.ckpt-1.index to bert_model.ckpt.index
```
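The same rename is needed here; a compact sketch of the step, assuming the output directory from the command above:
```
cd /public/home/hepj/model_source/bert-large-TF2
for f in bert_model.ckpt-1.*; do
  mv "$f" "bert_model.ckpt.${f#bert_model.ckpt-1.}"
done
```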
## Single-card run
```
sh bert_squad.sh
@@ -165,7 +173,7 @@ sh bert_squad.sh
--eval_data_path       path to the evaluation data
--bert_config_file     BERT model config file
--init_checkpoint      initial checkpoint path
--train_batch_size     training batch size
--predict_file         prediction file path
--eval_batch_size      evaluation batch size
--steps_per_loop       interval (in steps) between log output
@@ -176,20 +184,20 @@ sh bert_squad.sh
--num_gpus             number of GPUs to use
```
## Multi-card run
```
sh bert_squad_gpus.sh
```
## Model accuracy
To be completed...

# Source repository and issue feedback
https://developer.hpccube.com/codes/modelzoo/bert-tf2

# References
https://github.com/tensorflow/models/tree/v2.3.0/official/nlp
bert_class.sh (single-card MNLI run):

export HIP_VISIBLE_DEVICES=0
python3 run_classifier.py \
  --mode train_and_eval \
  --input_meta_data_path /public/home/hepj/MNLI/meta_data \
  --train_data_path /public/home/hepj/MNLI/train.tf_record \
  --eval_data_path /public/home/hepj/MNLI/eval.tf_record \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
  --train_batch_size 320 \
  --eval_batch_size 32 \
  --steps_per_loop 1000 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --num_gpus 1 \
  --model_dir /public/home/hepj/model/tf2/out1 \
  --distribution_strategy mirrored
bert_class_gpus.sh (four-card MNLI run):

export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_classifier.py \
  --mode train_and_eval \
  --input_meta_data_path /public/home/hepj/MNLI/meta_data \
  --train_data_path /public/home/hepj/MNLI/train.tf_record \
  --eval_data_path /public/home/hepj/MNLI/eval.tf_record \
  --bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
  --init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
  --train_batch_size 320 \
  --eval_batch_size 32 \
  --steps_per_loop 1000 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --num_gpus 4 \
  --model_dir /public/home/hepj/model/tf2/out1 \
  --distribution_strategy mirrored
bert_squad.sh (single-card SQuAD1.1 run):

export HIP_VISIBLE_DEVICES=0
python3 run_squad_xuan.py \
  --mode train_and_eval \
  --vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
  --input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
  --predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
  --init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
  --train_batch_size 4 \
  --predict_batch_size 4 \
  --learning_rate 2e-5 \
  --log_steps 1 \
  --num_gpus 1 \
  --distribution_strategy mirrored \
  --model_dir /public/home/hepj/model/tf2/squad1
bert_squad_gpus.sh (four-card SQuAD1.1 run):

export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_squad_xuan.py \
  --mode train_and_eval \
  --vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
  --bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
  --input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
  --train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
  --predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
  --init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
  --train_batch_size 4 \
  --predict_batch_size 4 \
  --learning_rate 2e-5 \
  --log_steps 1 \
  --num_gpus 4 \
  --distribution_strategy mirrored \
  --model_dir /public/home/hepj/model/tf2/squad1
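Before launching these scripts, it can be useful to confirm that the devices selected via HIP_VISIBLE_DEVICES are visible to TensorFlow; a minimal sketch, assuming the DCU build reports accelerators as GPU devices:
```
export HIP_VISIBLE_DEVICES=0,1,2,3
# List the accelerators TensorFlow can see
python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
```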