Unverified commit 0507331f, authored by MissPenguin, committed by GitHub

Merge branch 'dygraph' into dygraph

parents 2cdaa0f8 39b831b9
@@ -80,7 +80,6 @@ class CTCHead(nn.Layer):
            result = (x, predicts)
        else:
            result = predicts
        if not self.training:
            predicts = F.softmax(predicts, axis=2)
            result = predicts
...
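The hunk above keeps the head's training output as raw logits and applies softmax only at inference time. A minimal sketch of the same pattern, using a hypothetical standalone layer rather than the actual PaddleOCR class:

```python
import paddle.nn as nn
import paddle.nn.functional as F

class TinyCTCHead(nn.Layer):
    """Toy stand-in for a CTC head: raw logits while training
    (the CTC loss wants logits), probabilities at inference."""

    def __init__(self, in_channels: int, num_classes: int):
        super().__init__()
        self.fc = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        predicts = self.fc(x)  # shape: (batch, time_steps, num_classes)
        if not self.training:
            # Decoding works on per-timestep probabilities.
            predicts = F.softmax(predicts, axis=2)
        return predicts
```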
@@ -89,7 +89,7 @@ class CTCLabelDecode(BaseRecLabelDecode):
                             use_space_char)

    def __call__(self, preds, label=None, *args, **kwargs):
        if isinstance(preds, tuple) or isinstance(preds, list):
            preds = preds[-1]
        if isinstance(preds, paddle.Tensor):
            preds = preds.numpy()
...
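The widened check lets the decoder accept heads that return a list as well as a tuple, taking the last element as the final prediction. A hedged sketch of that normalization step in isolation (the helper name is ours, not PaddleOCR's):

```python
import numpy as np
import paddle

def to_numpy_preds(preds):
    """Reduce a head's output to a numpy array. Multi-output heads
    return a tuple or list whose last element is the final prediction."""
    if isinstance(preds, (tuple, list)):
        preds = preds[-1]
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    return np.asarray(preds)
```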
@@ -117,7 +117,7 @@ teds: 93.32

```bash
cd PaddleOCR/ppstructure
python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

After running, an Excel sheet for each image is saved in the directory given by the `output` field.
...
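To spot-check one of the generated sheets, it can be loaded with pandas; the filename below is hypothetical, since sheets are named after the input images:

```python
import pandas as pd  # reading .xlsx files also requires openpyxl

# Hypothetical path: a sheet produced for ../doc/table/1.png.
df = pd.read_excel("../output/table/1.xlsx")
print(df.head())
```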
@@ -128,7 +128,7 @@ teds: 93.32

```bash
cd PaddleOCR/ppstructure
python3 table/predict_table.py --det_model_dir=path/to/det_model_dir --rec_model_dir=path/to/rec_model_dir --table_model_dir=path/to/table_model_dir --image_dir=../doc/table/1.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --output ../output/table
```

# Reference
...
@@ -58,7 +58,6 @@ class TableStructurer(object):
        }]
        postprocess_params = {
            'name': 'TableLabelDecode',
            "character_dict_path": args.table_char_dict_path,
        }
@@ -104,7 +103,9 @@ class TableStructurer(object):
            res_loc_final.append([left, top, right, bottom])
        structure_str_list = structure_str_list[0][:-1]
        structure_str_list = [
            '<html>', '<body>', '<table>'
        ] + structure_str_list + ['</table>', '</body>', '</html>']
        elapse = time.time() - starttime
        return (structure_str_list, res_loc_final), elapse
...
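The second hunk wraps the predicted table-structure tokens in a full HTML skeleton so the result renders as a standalone page. A small illustration, with made-up tokens standing in for real model output:

```python
# Fake structure tokens; the real ones come from the table model.
structure_tokens = ["<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>"]

html_tokens = ["<html>", "<body>", "<table>"] + structure_tokens + [
    "</table>", "</body>", "</html>"
]
print("".join(html_tokens))
# <html><body><table><tr><td></td><td></td></tr></table></body></html>
```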
@@ -26,7 +26,6 @@ def init_args():
    # params for table structure
    parser.add_argument("--table_max_len", type=int, default=488)
    parser.add_argument("--table_model_dir", type=str)
    parser.add_argument(
        "--table_char_dict_path",
        type=str,
...
English | [简体中文](README_ch.md)

- [Document Visual Question Answering (Doc-VQA)](#document-visual-question-answering)
- [1. Introduction](#1-introduction)
- [2. Performance](#2-performance)
- [3. Demo](#3-demo)
  - [3.1 SER](#31-ser)
  - [3.2 RE](#32-re)
- [4. Installation](#4-installation)
  - [4.1 Install dependencies](#41-install-dependencies)
  - [4.2 Install PaddleOCR](#42-install-paddleocr)
- [5. Usage](#5-usage)
  - [5.1 Data and model preparation](#51-data-and-model-preparation)
  - [5.2 SER](#52-ser)
  - [5.3 RE](#53-re)
- [6. References](#6-references)

# Document Visual Question Answering

## 1. Introduction

VQA (Visual Question Answering) poses and answers questions about image content. Doc-VQA is a VQA task in which the questions concern the textual content of document images.

The Doc-VQA algorithms in PP-Structure are built on the PaddleNLP natural language processing library.

The main features are as follows:

- Integrates the [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) model and the PP-OCR prediction engine.
- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. SER recognizes and classifies the text in an image; RE extracts relations between text contents, such as matching question/answer pairs.
- Supports custom training for both SER and RE tasks.
- Supports end-to-end prediction and evaluation of the OCR+SER pipeline.
- Supports end-to-end prediction of the OCR+SER+RE pipeline.

This project is an open-source implementation of [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) on Paddle 2.2, including fine-tuning code on the [XFUND dataset](https://github.com/doc-analysis/XFUND).

## 2. Performance

We evaluate the algorithms on the Chinese subset of [XFUND](https://github.com/doc-analysis/XFUND); the performance is as follows:

| Model | Task | hmean | Download |
|:---:|:---:|:---:|:---:|
| LayoutXLM | SER | 0.9038 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
| LayoutXLM | RE | 0.7483 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
| LayoutLMv2 | SER | 0.8544 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
| LayoutLMv2 | RE | 0.6777 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
| LayoutLM | SER | 0.7731 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
<a name="3"></a> ## 3. Effect demo
## 3. 效果演示
**注意:** 测试图片来源于XFUN数据集。 **Note:** The test images are from the XFUND dataset.
<a name="31"></a> <a name="31"></a>
### 3.1 SER ### 3.1 SER
...@@ -59,13 +57,13 @@ PP-Structure 里的DocVQA算法基于PaddleNLP自然语言处理算法库进行 ...@@ -59,13 +57,13 @@ PP-Structure 里的DocVQA算法基于PaddleNLP自然语言处理算法库进行
![](../docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](../docs/vqa/result_ser/zh_val_42_ser.jpg) ![](../docs/vqa/result_ser/zh_val_0_ser.jpg) | ![](../docs/vqa/result_ser/zh_val_42_ser.jpg)
---|--- ---|---
图中不同颜色的框表示不同的类别,对于XFUN数据集,有`QUESTION`, `ANSWER`, `HEADER` 3种类别 Boxes with different colors in the figure represent different categories. For the XFUND dataset, there are 3 categories: `QUESTION`, `ANSWER`, `HEADER`
* 深紫色:HEADER * Dark purple: HEADER
* 浅紫色:QUESTION * Light purple: QUESTION
* 军绿色:ANSWER * Army Green: ANSWER
在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 The corresponding categories and OCR recognition results are also marked on the upper left of the OCR detection frame.
<a name="32"></a> <a name="32"></a>
### 3.2 RE ### 3.2 RE
...@@ -74,183 +72,187 @@ PP-Structure 里的DocVQA算法基于PaddleNLP自然语言处理算法库进行 ...@@ -74,183 +72,187 @@ PP-Structure 里的DocVQA算法基于PaddleNLP自然语言处理算法库进行
---|--- ---|---
图中红色框表示问题,蓝色框表示答案,问题和答案之间使用绿色线连接。在OCR检测框的左上方也标出了对应的类别和OCR识别结果。 The red box in the figure represents the question, the blue box represents the answer, and the question and the answer are connected by a green line. The corresponding categories and OCR recognition results are also marked on the upper left of the OCR detection frame.
<a name="4"></a> ## 4. Install
## 4. 安装
<a name="41"></a> ### 4.1 Install dependencies
### 4.1 安装依赖
- **1) 安装PaddlePaddle** - **(1) Install PaddlePaddle**
```bash ```bash
python3 -m pip install --upgrade pip python3 -m pip install --upgrade pip
# GPU安装 # GPU installation
python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple
# CPU安装 # CPU installation
python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple
``` ````
更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/install/quick).
<a name="42"></a> ### 4.2 Install PaddleOCR
### 4.2 安装PaddleOCR
- **(1)pip快速安装PaddleOCR whl包(仅预测)** - **(1) pip install PaddleOCR whl package quickly (prediction only)**
```bash ```bash
python3 -m pip install paddleocr python3 -m pip install paddleocr
``` ````
- **(2)下载VQA源码(预测+训练)** - **(2) Download VQA source code (prediction + training)**
```bash ```bash
【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR [Recommended] git clone https://github.com/PaddlePaddle/PaddleOCR
# 如果因为网络问题无法pull成功,也可选择使用码云上的托管: # If the pull cannot be successful due to network problems, you can also choose to use the hosting on the code cloud:
git clone https://gitee.com/paddlepaddle/PaddleOCR git clone https://gitee.com/paddlepaddle/PaddleOCR
# 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 # Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first.
``` ````
- **(3)安装VQA的`requirements`** - **(3) Install VQA's `requirements`**
```bash ```bash
python3 -m pip install -r ppstructure/vqa/requirements.txt python3 -m pip install -r ppstructure/vqa/requirements.txt
``` ````
<a name="5"></a> ## 5. Usage
## 5. 使用
<a name="51"></a> ### 5.1 Data and Model Preparation
### 5.1 数据和预训练模型准备
如果希望直接体验预测过程,可以下载我们提供的预训练模型,跳过训练过程,直接预测即可。 If you want to experience the prediction process directly, you can download the pre-training model provided by us, skip the training process, and just predict directly.
* 下载处理好的数据集 * Download the processed dataset
处理好的XFUN中文数据集下载地址:[https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar) The download address of the processed XFUND Chinese dataset: [https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar).
下载并解压该数据集,解压后将数据集放置在当前目录下。 Download and unzip the dataset, and place the dataset in the current directory after unzipping.
```shell ```shell
wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar
``` ````
* Convert the dataset
* 转换数据集 If you need to train other XFUND datasets, you can use the following commands to convert the datasets
若需进行其他XFUN数据集的训练,可使用下面的命令进行数据集的转换 ```bash
python3 ppstructure/vqa/tools/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path
````
* Download the pretrained models
```bash ```bash
python3 ppstructure/vqa/helper/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path mkdir pretrain && cd pretrain
``` #download the SER model
wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar && tar -xvf ser_LayoutXLM_xfun_zh.tar
#download the RE model
wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar && tar -xvf re_LayoutXLM_xfun_zh.tar
cd ../
````
<a name="52"></a> <a name="52"></a>
### 5.2 SER ### 5.2 SER
启动训练之前,需要修改下面的四个字段 Before starting training, you need to modify the following four fields
1. `Train.dataset.data_dir`:指向训练集图片存放目录 1. `Train.dataset.data_dir`: point to the directory where the training set images are stored
2. `Train.dataset.label_file_list`:指向训练集标注文件 2. `Train.dataset.label_file_list`: point to the training set label file
3. `Eval.dataset.data_dir`:指指向验证集图片存放目录 3. `Eval.dataset.data_dir`: refers to the directory where the validation set images are stored
4. `Eval.dataset.label_file_list`:指向验证集标注文件 4. `Eval.dataset.label_file_list`: point to the validation set label file
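A minimal way to patch those fields without editing the file by hand, assuming PyYAML is available; the data paths below are hypothetical examples of where the unpacked XFUND data might live:

```python
import yaml  # assumption: PyYAML is installed

with open("configs/vqa/ser/layoutxlm.yml") as f:
    cfg = yaml.safe_load(f)

# Hypothetical local paths to the unpacked XFUND data.
cfg["Train"]["dataset"]["data_dir"] = "XFUND/zh_train/image"
cfg["Train"]["dataset"]["label_file_list"] = ["XFUND/zh_train/xfun_normalize_train.json"]
cfg["Eval"]["dataset"]["data_dir"] = "XFUND/zh_val/image"
cfg["Eval"]["dataset"]["label_file_list"] = ["XFUND/zh_val/xfun_normalize_val.json"]

with open("configs/vqa/ser/layoutxlm_local.yml", "w") as f:
    yaml.safe_dump(cfg, f)
```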
* Start training

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed. The training log, the best model, and the model of the latest epoch are saved in the `./output/ser_layoutxlm/` folder.

* Resume training

To resume training, assign the folder of the previously trained model to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

* Evaluate

Evaluation requires assigning the folder of the model to be evaluated to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed.

* Cascaded prediction with `OCR engine + SER`

Use the following command to run the cascaded `OCR engine + SER` prediction, taking the pre-trained SER model as an example:

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
```

The visualized prediction results and a prediction text file named `infer_results.txt` are saved in the directory configured by the `config.Global.save_res_path` field.

* End-to-end evaluation of the `OCR engine + SER` prediction system

First run prediction over the dataset with the `tools/infer_vqa_token_ser.py` script, then evaluate with the following command:

```shell
export CUDA_VISIBLE_DEVICES=0
python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
```
<a name="53"></a> <a name="53"></a>
### 5.3 RE ### 5.3 RE
* 启动训练 * start training
启动训练之前,需要修改下面的四个字段 Before starting training, you need to modify the following four fields
1. `Train.dataset.data_dir`:指向训练集图片存放目录 1. `Train.dataset.data_dir`: point to the directory where the training set images are stored
2. `Train.dataset.label_file_list`:指向训练集标注文件 2. `Train.dataset.label_file_list`: point to the training set label file
3. `Eval.dataset.data_dir`:指指向验证集图片存放目录 3. `Eval.dataset.data_dir`: refers to the directory where the validation set images are stored
4. `Eval.dataset.label_file_list`:指向验证集标注文件 4. `Eval.dataset.label_file_list`: point to the validation set label file
```shell ```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml
``` ````
最终会打印出`precision`, `recall`, `hmean`等指标。 Finally, `precision`, `recall`, `hmean` and other indicators will be printed.
`./output/re_layoutxlm/`文件夹中会保存训练日志,最优的模型和最新epoch的模型。 In the `./output/re_layoutxlm/` folder will save the training log, the optimal model and the model for the latest epoch.
* 恢复训练 * resume training
恢复训练需要将之前训练好的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 To resume training, assign the folder path of the previously trained model to the `Architecture.Backbone.checkpoints` field.
```shell ```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
``` ````
* 评估 * evaluate
评估需要将待评估的模型所在文件夹路径赋值给 `Architecture.Backbone.checkpoints` 字段。 Evaluation requires assigning the folder path of the model to be evaluated to the `Architecture.Backbone.checkpoints` field.
```shell ```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
``` ````
最终会打印出`precision`, `recall`, `hmean`等指标 Finally, `precision`, `recall`, `hmean` and other indicators will be printed
* 使用`OCR引擎 + SER + RE`串联预测 * Use `OCR engine + SER + RE` tandem prediction
使用如下命令即可完成`OCR引擎 + SER + RE`的串联预测 Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example:
```shell ```shell
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=ser_LayoutXLM_xfun_zh/ python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm. yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
``` ````
最终会在`config.Global.save_res_path`字段所配置的目录下保存预测结果可视化图像以及预测结果文本文件,预测结果文本文件名为`infer_results.txt` Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
<a name="6"></a> ## 6. Reference Links
## 6. 参考链接
- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf - LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm - microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm
......
[English](README.md) | Simplified Chinese

- [Document Visual Question Answering (Doc-VQA)](#document-visual-question-answering-doc-vqa)
- [1. Introduction](#1-introduction)
- [2. Performance](#2-performance)
- [3. Demo](#3-demo)
  - [3.1 SER](#31-ser)
  - [3.2 RE](#32-re)
- [4. Installation](#4-installation)
  - [4.1 Install dependencies](#41-install-dependencies)
  - [4.2 Install PaddleOCR (including PP-OCR and VQA)](#42-install-paddleocr-including-pp-ocr-and-vqa)
- [5. Usage](#5-usage)
  - [5.1 Data and pre-trained model preparation](#51-data-and-pre-trained-model-preparation)
  - [5.2 SER](#52-ser)
  - [5.3 RE](#53-re)
- [6. References](#6-references)
# Document Visual Question Answering (Doc-VQA)

## 1. Introduction

VQA (Visual Question Answering) poses and answers questions about image content. Doc-VQA is a VQA task in which the questions concern the textual content of document images.

The Doc-VQA algorithms in PP-Structure are built on the PaddleNLP natural language processing library.

The main features are as follows:

- Integrates the [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf) model and the PP-OCR prediction engine.
- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. SER recognizes and classifies the text in an image; RE extracts relations between text contents, such as matching question/answer pairs.
- Supports custom training for both SER and RE tasks.
- Supports end-to-end prediction and evaluation of the OCR+SER pipeline.
- Supports end-to-end prediction of the OCR+SER+RE pipeline.

This project is an open-source implementation of [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/pdf/2104.08836.pdf) on Paddle 2.2, including fine-tuning code on the [XFUND dataset](https://github.com/doc-analysis/XFUND).

## 2. Performance

We evaluate the algorithms on the Chinese subset of [XFUND](https://github.com/doc-analysis/XFUND); the performance is as follows:

| Model | Task | hmean | Download |
|:---:|:---:|:---:|:---:|
| LayoutXLM | SER | 0.9038 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar) |
| LayoutXLM | RE | 0.7483 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar) |
| LayoutLMv2 | SER | 0.8544 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLMv2_xfun_zh.tar) |
| LayoutLMv2 | RE | 0.6777 | [link](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutLMv2_xfun_zh.tar) |
| LayoutLM | SER | 0.7731 | [link](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutLM_xfun_zh.tar) |
## 3. Demo

**Note:** The test images are from the XFUND dataset.

### 3.1 SER

![](../../doc/vqa/result_ser/zh_val_0_ser.jpg) | ![](../../doc/vqa/result_ser/zh_val_42_ser.jpg)
---|---

Boxes of different colors denote different categories. The XFUND dataset has three categories: `QUESTION`, `ANSWER`, and `HEADER`:

* Dark purple: HEADER
* Light purple: QUESTION
* Army green: ANSWER

The predicted category and the OCR recognition result are also shown at the top left of each OCR detection box.

### 3.2 RE

![](../../doc/vqa/result_re/zh_val_21_re.jpg) | ![](../../doc/vqa/result_re/zh_val_40_re.jpg)
---|---

In the figure, red boxes are questions and blue boxes are answers; each question is linked to its answer by a green line. The predicted category and the OCR recognition result are also shown at the top left of each OCR detection box.
## 4. Installation

### 4.1 Install dependencies

- **(1) Install PaddlePaddle**

```bash
python3 -m pip install --upgrade pip

# GPU installation
python3 -m pip install "paddlepaddle-gpu>=2.2" -i https://mirror.baidu.com/pypi/simple

# CPU installation
python3 -m pip install "paddlepaddle>=2.2" -i https://mirror.baidu.com/pypi/simple
```

For other options, please follow the instructions in the [installation documentation](https://www.paddlepaddle.org.cn/install/quick).

### 4.2 Install PaddleOCR (including PP-OCR and VQA)

- **(1) Quick pip install of the PaddleOCR whl package (prediction only)**

```bash
python3 -m pip install paddleocr
```

- **(2) Download the VQA source code (prediction + training)**

```bash
# [Recommended]
git clone https://github.com/PaddlePaddle/PaddleOCR

# If the pull fails due to network problems, you can also use the Gitee mirror:
git clone https://gitee.com/paddlepaddle/PaddleOCR

# Note: the Gitee mirror may lag behind this GitHub project by 3 to 5 days; please prefer the recommended source.
```

- **(3) Install the VQA `requirements`**

```bash
python3 -m pip install -r ppstructure/vqa/requirements.txt
```
## 5. Usage

### 5.1 Data and pre-trained model preparation

If you want to try prediction directly, you can download the pre-trained models we provide, skip training, and predict right away.

* Download the processed dataset

The processed XFUND Chinese dataset can be downloaded from [https://paddleocr.bj.bcebos.com/dataset/XFUND.tar](https://paddleocr.bj.bcebos.com/dataset/XFUND.tar). Download and unzip it, then place it in the current directory:

```shell
wget https://paddleocr.bj.bcebos.com/dataset/XFUND.tar
```

* Convert the dataset

To train on other XFUND subsets, convert them with the following command:

```bash
python3 ppstructure/vqa/tools/trans_xfun_data.py --ori_gt_path=path/to/json_path --output_path=path/to/save_path
```

* Download the pre-trained models

```bash
mkdir pretrain && cd pretrain
# download the SER model
wget https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar && tar -xvf ser_LayoutXLM_xfun_zh.tar
# download the RE model
wget https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar && tar -xvf re_LayoutXLM_xfun_zh.tar
cd ../
```
### 5.2 SER

Before starting training, you need to set the following four fields:

1. `Train.dataset.data_dir`: directory where the training-set images are stored
2. `Train.dataset.label_file_list`: path to the training-set label file
3. `Eval.dataset.data_dir`: directory where the validation-set images are stored
4. `Eval.dataset.label_file_list`: path to the validation-set label file

* Start training

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed. The training log, the best model, and the model of the latest epoch are saved in the `./output/ser_layoutxlm/` folder.

* Resume training

To resume training, assign the folder of the previously trained model to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

* Evaluate

Evaluation requires assigning the folder of the model to be evaluated to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed.

* Cascaded prediction with `OCR engine + SER`

Use the following command to run the cascaded `OCR engine + SER` prediction, taking the pre-trained SER model as an example:

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
```

The visualized prediction results and a prediction text file named `infer_results.txt` are saved in the directory configured by the `config.Global.save_res_path` field.

* End-to-end evaluation of the `OCR engine + SER` prediction system

First run prediction over the dataset with the `tools/infer_vqa_token_ser.py` script, then evaluate with the following command:

```shell
export CUDA_VISIBLE_DEVICES=0
python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
```
### 5.3 RE

* Start training

Before starting training, you need to set the same four fields as for SER:

1. `Train.dataset.data_dir`: directory where the training-set images are stored
2. `Train.dataset.label_file_list`: path to the training-set label file
3. `Eval.dataset.data_dir`: directory where the validation-set images are stored
4. `Eval.dataset.label_file_list`: path to the validation-set label file

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed. The training log, the best model, and the model of the latest epoch are saved in the `./output/re_layoutxlm/` folder.

* Resume training

To resume training, assign the folder of the previously trained model to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

* Evaluate

Evaluation requires assigning the folder of the model to be evaluated to the `Architecture.Backbone.checkpoints` field.

```shell
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=path/to/model_dir
```

Finally, metrics such as `precision`, `recall`, and `hmean` will be printed.

* Cascaded prediction with `OCR engine + SER + RE`

Use the following command to run the cascaded `OCR engine + SER + RE` prediction, taking the pre-trained SER and RE models as an example:

```shell
export CUDA_VISIBLE_DEVICES=0
python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
```

The visualized prediction results and a prediction text file named `infer_results.txt` are saved in the directory configured by the `config.Global.save_res_path` field.
## 6. References

- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm
- XFUND dataset, https://github.com/doc-analysis/XFUND

## License

The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
@@ -6,7 +6,7 @@ Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=1|whole_train_whole_infer=4
Global.pretrained_model:null
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
...
===========================train_params===========================
model_name:ch_PPOCRv2_rec_PACT
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=6|whole_train_whole_infer=300
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=16|whole_train_whole_infer=128
Global.pretrained_model:pretrain_models/ch_PP-OCRv2_rec_train/best_accuracy
train_model_name:latest
train_infer_img_dir:./inference/rec_inference
null:null
...
@@ -3,7 +3,7 @@ model_name:ocr_det
use_opencv:True
infer_model:./inference/ch_ppocr_mobile_v2.0_det_infer/
infer_quant:False
inference:./deploy/cpp_infer/build/ppocr
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
@@ -14,3 +14,7 @@ inference:./deploy/cpp_infer/build/ppocr
--image_dir:./inference/ch_det_data_50/all-sum-510/
null:null
--benchmark:True
--det:True
--rec:False
--cls:False
--use_angle_cls:False
\ No newline at end of file
===========================train_params===========================
model_name:det_mv3_east_v2.0
python:python3.7
gpu_list:0|0,1
Global.use_gpu:True|True
Global.auto_cast:fp32
Global.epoch_num:lite_train_lite_infer=1|whole_train_whole_infer=500
Global.save_model_dir:./output/
Train.loader.batch_size_per_card:lite_train_lite_infer=2|whole_train_whole_infer=4
Global.pretrained_model:./pretrain_models/det_mv3_east_v2.0_train/best_accuracy
train_model_name:latest
train_infer_img_dir:./train_data/icdar2015/text_localization/ch4_test_images/
null:null
...
@@ -37,7 +37,7 @@ export2:null
train_model:./inference/rec_mv3_tps_bilstm_att_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/rec_mv3_tps_bilstm_att_v2.0/rec_mv3_tps_bilstm_att.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,32,100" --rec_algorithm="RARE" --min_subgraph_size=5
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
...
@@ -50,4 +50,4 @@ inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/dict90.t
--benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,48,160]}]
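The fix above drops a spurious extra dimension: the benchmark input is a single float32 image tensor of shape (3, 48, 160). A quick sketch of generating a matching dummy input (the leading batch dimension of 1 is our assumption about how the harness feeds it):

```python
import numpy as np

# Dummy input matching random_infer_input: float32, CHW = (3, 48, 160).
dummy = np.random.rand(1, 3, 48, 160).astype("float32")  # batch of 1 (assumed)
print(dummy.shape, dummy.dtype)  # (1, 3, 48, 160) float32
```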
@@ -37,7 +37,7 @@ export2:null
train_model:./inference/rec_r34_vd_tps_bilstm_att_v2.0_train/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/rec_r34_vd_tps_bilstm_att_v2.0/rec_r34_vd_tps_bilstm_att.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="3,32,100" --rec_algorithm="RARE" --min_subgraph_size=5
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
...
@@ -37,7 +37,7 @@ export2:null
train_model:./inference/rec_r50_vd_srn_train/best_accuracy
infer_export:tools/export_model.py -c test_tipc/configs/rec_r50_fpn_vd_none_srn/rec_r50_fpn_srn.yml -o
infer_quant:False
inference:tools/infer/predict_rec.py --rec_char_dict_path=./ppocr/utils/ic15_dict.txt --rec_image_shape="1,64,256" --rec_algorithm="SRN" --use_space_char=False --min_subgraph_size=3
--use_gpu:True|False
--enable_mkldnn:True|False
--cpu_threads:1|6
...
@@ -64,6 +64,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_server_v2.0_det_train.tar --no-check-certificate
        cd ./pretrain_models/ && tar xf ch_ppocr_server_v2.0_det_train.tar && cd ../
    fi
    if [ ${model_name} == "ch_PPOCRv2_rec" ] || [ ${model_name} == "ch_PPOCRv2_rec_PACT" ]; then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar --no-check-certificate
        cd ./pretrain_models/ && tar xf ch_PP-OCRv2_rec_train.tar && cd ../
    fi
    if [ ${model_name} == "det_r18_db_v2_0" ]; then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/pretrained/ResNet18_vd_pretrained.pdparams --no-check-certificate
    fi
@@ -91,6 +95,10 @@ if [ ${MODE} = "lite_train_lite_infer" ];then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_rec_train.tar --no-check-certificate
        cd ./pretrain_models/ && tar xf ch_ppocr_mobile_v2.0_rec_train.tar && cd ../
    fi
    if [ ${model_name} == "det_mv3_east_v2.0" ]; then
        wget -nc -P ./pretrain_models/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/det_mv3_east_v2.0_train.tar --no-check-certificate
        cd ./pretrain_models/ && tar xf det_mv3_east_v2.0_train.tar && cd ../
    fi
elif [ ${MODE} = "whole_train_whole_infer" ];then
    wget -nc -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV3_large_x0_5_pretrained.pdparams --no-check-certificate
...
@@ -2,7 +2,7 @@
source test_tipc/common_func.sh

FILENAME=$1
dataline=$(awk 'NR==1, NR==20{print}' $FILENAME)

# parser params
IFS=$'\n'
@@ -34,6 +34,14 @@ cpp_infer_key1=$(func_parser_key "${lines[14]}")
cpp_infer_value1=$(func_parser_value "${lines[14]}")
cpp_benchmark_key=$(func_parser_key "${lines[15]}")
cpp_benchmark_value=$(func_parser_value "${lines[15]}")
cpp_det_key=$(func_parser_key "${lines[16]}")
cpp_det_value=$(func_parser_value "${lines[16]}")
cpp_rec_key=$(func_parser_key "${lines[17]}")
cpp_rec_value=$(func_parser_value "${lines[17]}")
cpp_cls_key=$(func_parser_key "${lines[18]}")
cpp_cls_value=$(func_parser_value "${lines[18]}")
cpp_use_angle_cls_key=$(func_parser_key "${lines[19]}")
cpp_use_angle_cls_value=$(func_parser_value "${lines[19]}")
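The `awk 'NR==1, NR==20'` change reads 20 config lines instead of 16 because the new `--det`, `--rec`, `--cls`, and `--use_angle_cls` entries sit on lines 17 to 20 of the cpp config. The parser helpers come from `common_func.sh`; assuming they split each `key:value` config line at the first colon, a rough Python equivalent for illustration only:

```python
def func_parser_key(line: str) -> str:
    """Part before the first ':' (the parameter name)."""
    return line.split(":", 1)[0]

def func_parser_value(line: str) -> str:
    """Part after the first ':' (the parameter value)."""
    parts = line.split(":", 1)
    return parts[1] if len(parts) > 1 else ""

# The new cls entry from the cpp config file:
assert func_parser_key("--cls:False") == "--cls"
assert func_parser_value("--cls:False") == "False"
```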
LOG_PATH="./test_tipc/output"
mkdir -p ${LOG_PATH}
@@ -68,7 +76,11 @@ function func_cpp_inference(){
                    set_cpu_threads=$(func_set_params "${cpp_cpu_threads_key}" "${threads}")
                    set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}")
                    set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}")
                    set_det=$(func_set_params "${cpp_det_key}" "${cpp_det_value}")
                    set_rec=$(func_set_params "${cpp_rec_key}" "${cpp_rec_value}")
                    set_cls=$(func_set_params "${cpp_cls_key}" "${cpp_cls_value}")
                    set_use_angle_cls=$(func_set_params "${cpp_use_angle_cls_key}" "${cpp_use_angle_cls_value}")
                    command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_det} ${set_rec} ${set_cls} ${set_use_angle_cls} ${set_infer_params1} > ${_save_log_path} 2>&1 "
                    eval $command
                    last_status=${PIPESTATUS[0]}
                    eval "cat ${_save_log_path}"
@@ -97,7 +109,11 @@ function func_cpp_inference(){
                set_precision=$(func_set_params "${cpp_precision_key}" "${precision}")
                set_model_dir=$(func_set_params "${cpp_infer_model_key}" "${_model_dir}")
                set_infer_params1=$(func_set_params "${cpp_infer_key1}" "${cpp_infer_value1}")
                set_det=$(func_set_params "${cpp_det_key}" "${cpp_det_value}")
                set_rec=$(func_set_params "${cpp_rec_key}" "${cpp_rec_value}")
                set_cls=$(func_set_params "${cpp_cls_key}" "${cpp_cls_value}")
                set_use_angle_cls=$(func_set_params "${cpp_use_angle_cls_key}" "${cpp_use_angle_cls_value}")
                command="${_script} ${cpp_use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_det} ${set_rec} ${set_cls} ${set_use_angle_cls} ${set_infer_params1} > ${_save_log_path} 2>&1 "
                eval $command
                last_status=${PIPESTATUS[0]}
                eval "cat ${_save_log_path}"
...