Commit dbb44794 authored by hepj987

Add run scripts

parent 6652f879
@@ -37,6 +37,8 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-d
## Install dependencies
Installing the dependencies may replace the DCU build of TensorFlow; if that happens, download the matching DCU package from the [developer community](https://cancon.hpccube.com:65024/4/main/tensorflow/dtk22.10).
```
pip install -r requirements.txt
```
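After installing, it is worth confirming that the DCU build of TensorFlow is still the active one. A minimal check (the wheel filename in the comment is an assumed example, not a real artifact name):
```
python3 -c "import tensorflow as tf; print(tf.__version__)"
# If the DCU build was replaced, reinstall the wheel downloaded from the
# developer community, e.g. (assumed filename):
# pip install tensorflow-2.7.0+dtk2210-cp38-none-linux_x86_64.whl
```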
@@ -50,10 +52,10 @@ The TF2.0 version reads data converted to the tf_record format
```
python create_finetuning_data.py \
--input_data_dir=/public/home/hepj/data/MNLI \
- --vocab_file=/public/home/hepj/model/tf2.7.0_Bert/pre_tf2x/vocab.txt \
- --train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/train.tf_record \
- --eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/eval.tf_record \
- --meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/MNLI/meta_data \
+ --vocab_file=/public/home/hepj/model_source/uncased_L-12_H-768_A-12/vocab.txt \
+ --train_data_output_path=/public/home/hepj/MNLI/train.tf_record \
+ --eval_data_output_path=/public/home/hepj/MNLI/eval.tf_record \
+ --meta_data_file_path=/public/home/hepj/MNLI/meta_data \
--fine_tuning_task_type=classification \
--max_seq_length=32 \
--classification_task_name=MNLI
```
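A quick way to confirm the conversion produced usable output is to count the records in the generated file (path taken from the command above):
```
python3 - <<'EOF'
import tensorflow as tf
# Iterate the converted training set; an empty or malformed file
# shows up immediately as a zero count or an error.
path = "/public/home/hepj/MNLI/train.tf_record"
print(sum(1 for _ in tf.data.TFRecordDataset(path)), "training examples")
EOF
```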
@@ -76,13 +78,16 @@ TF2.7.2 and TF1.15.0 store and load checkpoints in different formats, so the official BERT checkpoints generally need to be converted
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
- --checkpoint_to_convert /public/home/hepjl/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
- --converted_checkpoint_path pre_tf2x/
+ --checkpoint_to_convert /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_model.ckpt \
+ --converted_checkpoint_path /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            path to the BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001,
and bert_model.ckpt-1.index to bert_model.ckpt.index.
```
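The rename step above amounts to the following, using the output path from the conversion command:
```
cd /public/home/hepj/model_source/bert-base-TF2
mv bert_model.ckpt-1.data-00000-of-00001 bert_model.ckpt.data-00000-of-00001
mv bert_model.ckpt-1.index bert_model.ckpt.index
```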
## Single-card run
@@ -109,12 +114,12 @@ sh bert_class.sh
## Multi-card run
```
-sh bert_class4.sh
+sh bert_class_gpus.sh
```
# SQuAD1.1 question answering test
-### Data conversion
+## Data conversion
The TF2.0 version reads data converted to the tf_record format
@@ -123,7 +128,7 @@
```
python3 create_finetuning_data.py \
--squad_data_file=/public/home/hepj/model/model_source/sq1.1/train-v1.1.json \
--vocab_file=/public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
--train_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/train_new.tf_record \
- --meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data_new \
+ --meta_data_file_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--eval_data_output_path=/public/home/hepj/model/tf2.7.0_Bert/squad1.1/eval_new.tf_record \
--fine_tuning_task_type=squad \
--do_lower_case=False \
@@ -139,21 +144,24 @@ python3 create_finetuning_data.py \
--max_seq_length   maximum sentence length
```
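The meta_data file is written as plain JSON by create_finetuning_data.py in the tensorflow/models BERT tooling, so a quick post-conversion sanity check can be (path from the command above):
```
python3 - <<'EOF'
import json
# meta_data holds task metadata such as train_data_size and max_seq_length.
with open("/public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data") as f:
    print(json.load(f))
EOF
```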
-### Model conversion
+## Model conversion
```
python3 tf2_encoder_checkpoint_converter.py \
--bert_config_file /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_config.json \
--checkpoint_to_convert /public/home/hepj/model/model_source/uncased_L-24_H-1024_A-16/bert_model.ckpt \
- --converted_checkpoint_path /public/home/hepj/model_source/bert-large-uncased-TF2/
+ --converted_checkpoint_path /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt

# Parameter description
--bert_config_file            path to the BERT model config file
--checkpoint_to_convert       path of the checkpoint to convert
--converted_checkpoint_path   output path for the converted checkpoint

After conversion, rename bert_model.ckpt-1.data-00000-of-00001 to bert_model.ckpt.data-00000-of-00001,
and bert_model.ckpt-1.index to bert_model.ckpt.index.
```
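As with the base model, the converted large checkpoint needs the same rename; a compact equivalent of the two mv commands:
```
cd /public/home/hepj/model_source/bert-large-TF2
# Strip the "-1" step suffix the converter appends to the checkpoint files.
for f in bert_model.ckpt-1.*; do
  mv "$f" "bert_model.ckpt.${f#bert_model.ckpt-1.}"
done
```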
-### Single-card run
+## Single-card run
```
sh bert_squad.sh
@@ -165,7 +173,7 @@ sh bert_squad.sh
--eval_data_path      path to the evaluation data
--bert_config_file    path to the BERT model config file
--init_checkpoint     initial checkpoint path
--train_batch_size    training batch size
--predict_file        path to the prediction file
--eval_batch_size     evaluation batch size
--steps_per_loop      log printing interval (steps)
@@ -176,20 +184,20 @@
--num_gpus            number of GPUs to use
```
-### Multi-card run
+## Multi-card run
```
-sh bert_squad4.sh
+sh bert_squad_gpus.sh
```
## Model accuracy
To be completed...
-## Source repository and issue feedback
+# Source repository and issue feedback
https://developer.hpccube.com/codes/modelzoo/bert-tf2
-## References
+# References
https://github.com/tensorflow/models/tree/v2.3.0/official/nlp
The four run scripts added by this commit; the filename mapping is inferred from the run sections above.

bert_class.sh (single-card MNLI classification):
```
export HIP_VISIBLE_DEVICES=0
python3 run_classifier.py \
--mode train_and_eval \
--input_meta_data_path /public/home/hepj/MNLI/meta_data \
--train_data_path /public/home/hepj/MNLI/train.tf_record \
--eval_data_path /public/home/hepj/MNLI/eval.tf_record \
--bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
--train_batch_size 320 \
--eval_batch_size 32 \
--steps_per_loop 1000 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--num_gpus 1 \
--model_dir /public/home/hepj/model/tf2/out1 \
--distribution_strategy mirrored
```
bert_class_gpus.sh (4-card MNLI classification):
```
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_classifier.py \
--mode train_and_eval \
--input_meta_data_path /public/home/hepj/MNLI/meta_data \
--train_data_path /public/home/hepj/MNLI/train.tf_record \
--eval_data_path /public/home/hepj/MNLI/eval.tf_record \
--bert_config_file /public/home/hepj/model_source/uncased_L-12_H-768_A-12/bert_config.json \
--init_checkpoint /public/home/hepj/model_source/bert-base-TF2/bert_model.ckpt \
--train_batch_size 320 \
--eval_batch_size 32 \
--steps_per_loop 1000 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--num_gpus 4 \
--model_dir /public/home/hepj/model/tf2/out1 \
--distribution_strategy mirrored
```
bert_squad.sh (single-card SQuAD1.1 fine-tuning):
```
export HIP_VISIBLE_DEVICES=0
python3 run_squad_xuan.py \
--mode train_and_eval \
--vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
--train_batch_size 4 \
--predict_batch_size 4 \
--learning_rate 2e-5 \
--log_steps 1 \
--num_gpus 1 \
--distribution_strategy mirrored \
--model_dir /public/home/hepj/model/tf2/squad1
```
bert_squad_gpus.sh (4-card SQuAD1.1 fine-tuning):
```
export HIP_VISIBLE_DEVICES=0,1,2,3
python3 run_squad_xuan.py \
--mode train_and_eval \
--vocab_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/vocab.txt \
--bert_config_file /public/home/hepj/model_source/bert-large-uncased-TF2/uncased_L-24_H-1024_A-16/bert_config.json \
--input_meta_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/meta_data \
--train_data_path /public/home/hepj/model/tf2.7.0_Bert/squad1.1/train.tf_record \
--predict_file /public/home/hepj/model/model_source/sq1.1/dev-v1.1.json \
--init_checkpoint /public/home/hepj/model_source/bert-large-TF2/bert_model.ckpt \
--train_batch_size 4 \
--predict_batch_size 4 \
--learning_rate 2e-5 \
--log_steps 1 \
--num_gpus 4 \
--distribution_strategy mirrored \
--model_dir /public/home/hepj/model/tf2/squad1
```
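Each script exposes exactly as many DCUs via HIP_VISIBLE_DEVICES as --num_gpus requests, and relies on --distribution_strategy mirrored for data-parallel replication; to run on a different number of cards, change both settings together.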