Merge remote-tracking branch 'origin/dygraph' into dygraph

df001f3c · Leif · 9cce1213 · bdca6cd7 · df001f3c · df001f3c
Commit df001f3c authored Apr 06, 2022 by Leif
20 changed files
--- a/deploy/hubserving/structure_table/config.json
+++ b/deploy/hubserving/structure_table/config.json
+{
+    "modules_info": {
+        "structure_table": {
+            "init_args": {
+                "version": "1.0.0",
+                "use_gpu": true
+            },
+            "predict_args": {
+            }
+        }
+    },
+    "port": 8869,
+    "use_multiprocess": false,
+    "workers": 2
+}
+
--- a/deploy/hubserving/structure_table/module.py
+++ b/deploy/hubserving/structure_table/module.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+sys.path.insert(0, ".")
+import copy
+
+import time
+import paddlehub
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, runnable, serving
+import cv2
+import numpy as np
+import paddlehub as hub
+
+from tools.infer.utility import base64_to_cv2
+from ppstructure.table.predict_table import TableSystem as _TableSystem
+from ppstructure.predict_system import save_structure_res
+from ppstructure.utility import parse_args
+from deploy.hubserving.structure_table.params import read_params
+
+
+@moduleinfo(
+    name="structure_table",
+    version="1.0.0",
+    summary="PP-Structure table service",
+    author="paddle-dev",
+    author_email="paddle-dev@baidu.com",
+    type="cv/structure_table")
+class TableSystem(hub.Module):
+    """
+    PaddleHub module that exposes the PP-Structure table predictor
+    (`ppstructure.table.predict_table.TableSystem`) as a serving endpoint.
+    """
+
+    def _initialize(self, use_gpu=False, enable_mkldnn=False):
+        """
+        Build the merged config and create the underlying table predictor.
+        Args:
+            use_gpu (bool): stored on the config as `use_gpu`; when true the
+                CUDA_VISIBLE_DEVICES environment variable is validated first.
+            enable_mkldnn (bool): stored on the config as `enable_mkldnn`.
+        Raises:
+            RuntimeError: `use_gpu` is true but CUDA_VISIBLE_DEVICES is unset,
+                empty, or does not start with a digit.
+        """
+        cfg = self.merge_configs()
+        cfg.use_gpu = use_gpu
+        if use_gpu:
+            try:
+                _places = os.environ["CUDA_VISIBLE_DEVICES"]
+                # Only the first character is checked: it must be a digit.
+                int(_places[0])
+            # KeyError: variable unset; IndexError: empty string;
+            # ValueError: first character is not a digit.
+            except (KeyError, IndexError, ValueError):
+                raise RuntimeError(
+                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES via export CUDA_VISIBLE_DEVICES=cuda_device_id."
+                )
+            print("use gpu: ", use_gpu)
+            print("CUDA_VISIBLE_DEVICES: ", _places)
+            cfg.gpu_mem = 8000
+        cfg.ir_optim = True
+        cfg.enable_mkldnn = enable_mkldnn
+
+        self.table_sys = _TableSystem(cfg)
+
+    def merge_configs(self):
+        """
+        Create the default config with `parse_args()` and overlay every
+        attribute of the object returned by `read_params()` on top of it.
+        Returns:
+            cfg: the merged config object.
+        """
+        # default cfg: sys.argv is cut down to the program name while
+        # parse_args() runs (presumably so that the serving process's own
+        # command-line flags are not parsed -- confirm against parse_args).
+        backup_argv = copy.deepcopy(sys.argv)
+        sys.argv = sys.argv[:1]
+        try:
+            cfg = parse_args()
+
+            update_cfg_map = vars(read_params())
+
+            for key in update_cfg_map:
+                setattr(cfg, key, update_cfg_map[key])
+        finally:
+            # Always restore the real argv, even if parsing fails.
+            sys.argv = copy.deepcopy(backup_argv)
+        return cfg
+
+    def read_images(self, paths=[]):
+        """
+        Load images from disk with `cv2.imread`.
+        Args:
+            paths (list[str]): image file paths (the default list is only
+                read, never mutated).
+        Returns:
+            images (list): the successfully decoded images; paths that
+                `cv2.imread` cannot decode are logged and skipped.
+        Raises:
+            AssertionError: a path is not an existing file.
+        """
+        images = []
+        for img_path in paths:
+            assert os.path.isfile(
+                img_path), "The {} isn't a valid file.".format(img_path)
+            img = cv2.imread(img_path)
+            if img is None:
+                logger.info("error in loading image:{}".format(img_path))
+                continue
+            images.append(img)
+        return images
+
+    def predict(self, images=[], paths=[]):
+        """
+        Run table prediction on every input image.
+        Exactly one of `images` and `paths` must be a non-empty list.
+        Args:
+            images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. Used when paths is empty.
+            paths (list[str]): The paths of images. Used when images is empty.
+        Returns:
+            res (list): one entry per input image, in input order:
+                {'html': <predictor output>} on success, or an empty list
+                when the image is None.
+        Raises:
+            TypeError: both or neither of `images` / `paths` are given.
+            AssertionError: no image is left to predict.
+        """
+
+        if images != [] and isinstance(images, list) and paths == []:
+            predicted_data = images
+        elif images == [] and isinstance(paths, list) and paths != []:
+            predicted_data = self.read_images(paths)
+        else:
+            raise TypeError("The input data is inconsistent with expectations.")
+
+        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."
+
+        all_results = []
+        for img in predicted_data:
+            if img is None:
+                logger.info("error in loading image")
+                # Keep a placeholder so results stay aligned with the inputs.
+                all_results.append([])
+                continue
+            starttime = time.time()
+            pred_html = self.table_sys(img)
+            elapse = time.time() - starttime
+            logger.info("Predict time: {}".format(elapse))
+
+            all_results.append({'html': pred_html})
+        return all_results
+
+    @serving
+    def serving_method(self, images, **kwargs):
+        """
+        Run as a service: decode each base64-encoded image with
+        `base64_to_cv2` and forward the decoded list to `predict`.
+        """
+        images_decode = [base64_to_cv2(image) for image in images]
+        results = self.predict(images_decode, **kwargs)
+        return results
+
+
+if __name__ == '__main__':
+    # Manual smoke test: initialize the module with its default arguments
+    # and print the prediction for the sample table image.
+    demo_paths = ['./doc/table/table.jpg']
+    table_service = TableSystem()
+    table_service._initialize()
+    print(table_service.predict(paths=demo_paths))
--- a/deploy/hubserving/structure_table/params.py
+++ b/deploy/hubserving/structure_table/params.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from deploy.hubserving.ocr_system.params import read_params as pp_ocr_read_params
+
+
+def read_params():
+    """
+    Return the parameters of the OCR-system hub module, extended with the
+    settings used by the table structure model.
+    """
+    cfg = pp_ocr_read_params()
+
+    # params for table structure model, applied in this order
+    table_overrides = (
+        ('table_max_len', 488),
+        ('table_model_dir',
+         './inference/en_ppocr_mobile_v2.0_table_structure_infer/'),
+        ('table_char_type', 'en'),
+        ('table_char_dict_path',
+         './ppocr/utils/dict/table_structure_dict.txt'),
+        ('show_log', False),
+    )
+    for attr_name, attr_value in table_overrides:
+        setattr(cfg, attr_name, attr_value)
+    return cfg
--- a/deploy/ios_demo/README.md
+++ b/deploy/ios_demo/README.md
+# iOS Demo
+
+参考 https://github.com/PaddlePaddle/Paddle-Lite-Demo/blob/develop/ocr/ios/ppocr_demo/ppocr_demo/README.md
--- a/deploy/slim/quantization/quant.py
+++ b/deploy/slim/quantization/quant.py
@@ -118,6 +118,11 @@ def main(config, device, logger, vdl_writer):
            config['Architecture']["Head"]['out_channels'] = char_num
    model = build_model(config['Architecture'])

+    pre_best_model_dict = dict()
+    # load fp32 model to begin quantization
+    if config["Global"]["pretrained_model"] is not None:
+        pre_best_model_dict = load_model(config, model)
+
    quanter = QAT(config=quant_config, act_preprocess=PACT)
    quanter.quantize(model)

@@ -134,10 +139,12 @@ def main(config, device, logger, vdl_writer):
        step_each_epoch=len(train_dataloader),
        parameters=model.parameters())

+    # resume PACT training process
+    if config["Global"]["checkpoints"] is not None:
+        pre_best_model_dict = load_model(config, model, optimizer)
+
    # build metric
    eval_class = build_metric(config['Metric'])
-    # load pretrain model
-    pre_best_model_dict = load_model(config, model, optimizer)

    logger.info('train dataloader has {} iters, valid dataloader has {} iters'.
                format(len(train_dataloader), len(valid_dataloader)))

--- a/doc/doc_ch/finetune.md
+++ b/doc/doc_ch/finetune.md
+# 模型微调
+
+## 1. 模型微调背景与意义
+
+PaddleOCR提供的PP-OCR系列模型在通用场景中性能优异，能够解决绝大多数情况下的检测与识别问题。在垂类场景中，如果希望获取更优的模型效果，可以通过模型微调的方法，进一步提升PP-OCR系列检测与识别模型的精度。
+
+本文主要介绍文本检测与识别模型在模型微调时的一些注意事项，最终希望您在自己的场景中，通过模型微调，可以获取精度更高的文本检测与识别模型。
+
+本文核心要点如下所示。
+
+1. PP-OCR提供的预训练模型有较好的泛化能力
+2. 加入少量真实数据（检测任务>=500张, 识别任务>=5000张），会大幅提升垂类场景的检测与识别效果
+3. 在模型微调时，加入真实通用场景数据，可以进一步提升模型精度与泛化性能
+4. 在图像检测任务中，增大图像的预测尺度，能够进一步提升较小文字区域的检测效果
+5. 在模型微调时，需要适当调整超参数（学习率，batch size最为重要），以获得更优的微调效果。
+
+更多详细内容，请参考第2章与第3章。
+
+## 2. 文本检测模型微调
+
+### 2.1 数据选择
+
+* 数据量：建议至少准备500张的文本检测数据集用于模型微调。
+
+* 数据标注：单行文本标注格式，建议标注的检测框与实际语义内容一致。如在火车票场景中，姓氏与名字可能离得较远，但是它们在语义上属于同一个检测字段，这里也需要将整个姓名标注为1个检测框。
+
+### 2.2 模型选择
+
+建议选择PP-OCRv2模型（配置文件：[ch_PP-OCRv2_det_student.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml)，预训练模型：[ch_PP-OCRv2_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)）进行微调，其精度与泛化性能是目前提供的最优预训练模型。
+
+更多PP-OCR系列模型，请参考[PaddleOCR 首页说明文档](../../README_ch.md)。
+
+注意：在使用上述预训练模型的时候，由于保存的模型中包含教师模型，因此需要将其中的学生模型单独提取出来，再加载学生模型即可进行模型微调。
+
+```python
+import paddle
+# 加载完整的检测预训练模型
+a = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
+# 提取学生模型的参数
+b = {k[len("student_model."):]: a[k] for k in a if "student_model." in k}
+# 保存模型，用于后续模型微调
+paddle.save(b, "ch_PP-OCRv2_det_student.pdparams")
+```
+
+
+### 2.3 训练超参选择
+
+在模型微调的时候，最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`与`batch_size`，部分配置文件如下所示。
+
+```yaml
+Global:
+  pretrained_model: ./pretrain_models/ch_PP-OCRv2_det_student.pdparams # 预训练模型路径
+Optimizer:
+  lr:
+    name: Cosine
+    learning_rate: 0.001 # 学习率
+    warmup_epoch: 2
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 8  # 单卡batch size
+    num_workers: 4
+```
+
+上述配置文件中，首先需要将`pretrained_model`字段指定为2.2章节中提取出来的`ch_PP-OCRv2_det_student.pdparams`文件路径。
+
+PaddleOCR提供的配置文件是在8卡训练（相当于总的batch size是`8*8=64`）、且没有加载预训练模型情况下的配置文件，因此您的场景中，学习率与总的batch size需要对应线性调整，例如：
+
+* 如果您的场景中是单卡训练，单卡batch_size=8，则总的batch_size=8，建议将学习率调整为`1e-4`左右。
+* 如果您的场景中是单卡训练，由于显存限制，只能设置单卡batch_size=4，则总的batch_size=4，建议将学习率调整为`5e-5`左右。
+
+### 2.4 预测超参选择
+
+对训练好的模型导出并进行推理时，可以通过进一步调整预测的图像尺度，来提升小面积文本的检测效果，下面是DBNet推理时的一些超参数，可以通过适当调整，提升效果。
+
+| 参数名称 | 类型 | 默认值 | 含义 |
+| :--: | :--: | :--: | :--: |
+|  det_db_thresh | float | 0.3 | DB输出的概率图中，得分大于该阈值的像素点才会被认为是文字像素点 |
+|  det_db_box_thresh | float | 0.6 | 检测结果边框内，所有像素点的平均得分大于该阈值时，该结果会被认为是文字区域 |
+|  det_db_unclip_ratio | float | 1.5 | `Vatti clipping`算法的扩张系数，使用该方法对文字区域进行扩张 |
+|  max_batch_size | int | 10 | 预测的batch size |
+|  use_dilation | bool | False | 是否对分割结果进行膨胀以获取更优检测效果 |
+|  det_db_score_mode | str | "fast" | DB的检测结果得分计算方法，支持`fast`和`slow`，`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分，`slow`是根据原始polygon内的所有像素计算平均得分，计算速度相对较慢一些，但是更加准确一些。 |
+
+
+更多关于推理方法的介绍可以参考[Paddle Inference推理教程](./inference.md)。
+
+
+## 3. 文本识别模型微调
+
+
+### 3.1 数据选择
+
+* 数据量：不更换字典的情况下，建议至少准备5000张的文本识别数据集用于模型微调；如果更换了字典（不建议），需要的数量更多。
+
+* 数据分布：建议分布与实测场景尽量一致。如果实测场景包含大量短文本，则训练数据中建议也包含较多短文本，如果实测场景对于空格识别效果要求较高，则训练数据中建议也包含较多带空格的文本内容。
+
+
+* 通用中英文数据：在训练的时候，可以在训练集中添加通用真实数据（如在不更换字典的微调场景中，建议添加LSVT、RCTW、MTWI等真实数据），进一步提升模型的泛化性能。
+
+### 3.2 模型选择
+
+建议选择PP-OCRv2模型（配置文件：[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml)，预训练模型：[ch_PP-OCRv2_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)）进行微调，其精度与泛化性能是目前提供的最优预训练模型。
+
+更多PP-OCR系列模型，请参考[PaddleOCR 首页说明文档](../../README_ch.md)。
+
+
+### 3.3 训练超参选择
+
+与文本检测任务微调相同，在识别模型微调的时候，最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`与`batch_size`，部分默认配置文件如下所示。
+
+```yaml
+Global:
+  pretrained_model:  # 预训练模型路径
+Optimizer:
+  lr:
+    name: Piecewise
+    decay_epochs : [700, 800]
+    values : [0.001, 0.0001]  # 学习率
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - ./train_data/train_list.txt
+    ratio_list: [1.0] # 采样比例，默认值是[1.0]
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 128 # 单卡batch size
+    num_workers: 8
+
+```
+
+
+上述配置文件中，首先需要将`pretrained_model`字段指定为3.2章节中解压得到的`ch_PP-OCRv2_rec_train/best_accuracy.pdparams`文件路径。
+
+PaddleOCR提供的配置文件是在8卡训练（相当于总的batch size是`8*128=1024`）、且没有加载预训练模型情况下的配置文件，因此您的场景中，学习率与总的batch size需要对应线性调整，例如：
+
+* 如果您的场景中是单卡训练，单卡batch_size=128，则总的batch_size=128，在加载预训练模型的情况下，建议将学习率调整为`[1e-4, 2e-5]`左右（piecewise学习率策略，需设置2个值，下同）。
+* 如果您的场景中是单卡训练，因为显存限制，只能设置单卡batch_size=64，则总的batch_size=64，在加载预训练模型的情况下，建议将学习率调整为`[5e-5, 1e-5]`左右。
+
+
+如果有通用真实场景数据加进来，建议每个epoch中，垂类场景数据与真实场景的数据量保持在1:1左右。
+
+比如：您自己的垂类场景识别数据量为1W，数据标签文件为`vertical.txt`，收集到的通用场景识别数据量为10W，数据标签文件为`general.txt`，
+
+
+那么，可以设置`label_file_list`和`ratio_list`参数如下所示。每个epoch中，`vertical.txt`中会进行全采样（采样比例为1.0），包含1W条数据；`general.txt`中会按照0.1的采样比例进行采样，包含`10W*0.1=1W`条数据，最终二者的比例为`1:1`。
+
+```yaml
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - vertical.txt
+    - general.txt
+    ratio_list: [1.0, 0.1]
+```
--- a/doc/doc_ch/inference.md
+++ b/doc/doc_ch/inference.md
@@ -36,6 +36,8 @@ inference 模型（`paddle.jit.save`保存的模型）

 - [六、参数解释](#参数解释)

+- [七、FAQ](#FAQ)
+

 <a name="训练模型转inference模型"></a>
 ## 一、训练模型转inference模型
@@ -520,3 +522,9 @@ PSE算法相关参数如下
 |  label_list | list | ['0', '180'] | class id对应的角度值 |
 |  cls_batch_num | int | 6 | 方向分类器预测的batch size |
 |  cls_thresh | float | 0.9 | 预测阈值，模型预测结果为180度，且得分大于该阈值时，认为最终预测结果为180度，需要翻转 |
+
+
+
+## 七、FAQ
+
+* 如果是使用paddle2.0之前版本的代码导出的`inference模型`，则其文件名为`model`与`params`，分别对应paddle2.0或者之后版本导出的`inference.pdmodel`与`inference.pdiparams`；不过目前PaddleOCR的release分支已经不支持paddle2.0之前版本导出的inference 模型，如果希望使用，需要使用develop分支（静态图分支）的代码与文档。
--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -75,9 +75,9 @@ train_data/rec/train/word_002.jpg   用科技让复杂的世界更简单
 上述示例标注文件中，"11.jpg"和"12.jpg"的标签相同，都是`简单可依赖`，在训练的时候，对于该行标注，会随机选择其中的一张图片进行训练。


- 测试集
+- 验证集

-同训练集类似，测试集也需要提供一个包含所有图片的文件夹（test）和一个rec_gt_test.txt，测试集的结构如下所示：
+同训练集类似，验证集也需要提供一个包含所有图片的文件夹（test）和一个rec_gt_test.txt，验证集的结构如下所示：

 ```
 |-train_data
@@ -247,7 +247,10 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t
 | rec_r31_sar.yml               | SAR | ResNet31 | None | LSTM encoder | LSTM decoder |
 | rec_resnet_stn_bilstm_att.yml | SEED | Aster_Resnet | STN | BiLSTM | att |

-*其中SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz)
+*其中SEED模型需要额外加载FastText训练好的[语言模型](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz)，并且安装 fasttext 依赖：
+```
+python3.7 -m pip install fasttext==0.9.1
+```

 训练中文数据，推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml)，如您希望尝试其他算法在中文数据集上的效果，请参考下列说明修改配置文件：


--- a/doc/doc_ch/whl.md
+++ b/doc/doc_ch/whl.md
@@ -418,7 +418,7 @@ im_show.save('result.jpg')
 | det                     | 前向时使用启动检测                                                                                                                                                                                                   | TRUE                    |
 | rec                     | 前向时是否启动识别                                                                                                                                                                                                   | TRUE                    |
 | cls                     | 前向时是否启动分类 (命令行模式下使用use_angle_cls控制前向是否启动分类)                                                                                                                                                                                                | FALSE                    |
-| show_log                     | 是否打印det和rec等信息                                                                                                                                                                                                | FALSE                    |
+| show_log                     | 是否打印logger信息                                                                                                                                               | FALSE                    |
 | type                     | 执行ocr或者表格结构化, 值可选['ocr','structure']                                                                                                                                                                                             | ocr                    |
 | ocr_version                     | OCR模型版本，可选PP-OCRv2, PP-OCR。PP-OCRv2 目前仅支持中文的检测和识别模型，PP-OCR支持中文的检测，识别，多语种识别，方向分类器等模型                                                                                                                                        | PP-OCRv2                   |
 | structure_version                     | 表格结构化模型版本，可选 STRUCTURE。STRUCTURE支持表格结构化模型                                                                                                                                                                                        | STRUCTURE                    |
--- a/doc/doc_en/whl_en.md
+++ b/doc/doc_en/whl_en.md
@@ -365,7 +365,7 @@ im_show.save('result.jpg')
 | det                     | Enable detction when `ppocr.ocr` func exec                                                                                                                                                                                                   | TRUE                    |
 | rec                     | Enable recognition when `ppocr.ocr` func exec                                                                                                                                                                                                   | TRUE                    |
 | cls                     | Enable classification when `ppocr.ocr` func exec((Use use_angle_cls in command line mode to control whether to start classification in the forward direction)                                                                                                                                                                                                   | FALSE                    |
-| show_log                     | Whether to print log in det and rec | FALSE                    |
+| show_log                     | Whether to print log | FALSE                    |
 | type                     | Perform ocr or table structuring, the value is selected in ['ocr','structure']                                                                                                                                                                                             | ocr                    |
 | ocr_version                     | OCR Model version number, the current model support list is as follows: PP-OCRv2 support Chinese detection and recognition model, PP-OCR support Chinese detection, recognition and direction classifier, multilingual recognition model | PP-OCRv2                 |
 | structure_version                     | table structure Model version number, the current model support list is as follows: STRUCTURE support english table structure model | STRUCTURE                 |
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -14,6 +14,7 @@

 import os
 import sys
+import importlib

 __dir__ = os.path.dirname(__file__)

@@ -26,6 +27,10 @@ import logging
 import numpy as np
 from pathlib import Path

+tools = importlib.import_module('.', 'tools')
+ppocr = importlib.import_module('.', 'ppocr')
+ppstructure = importlib.import_module('.', 'ppstructure')
+
 from tools.infer import predict_system
 from ppocr.utils.logging import get_logger

@@ -34,7 +39,7 @@ from ppocr.utils.utility import check_and_read_gif, get_image_file_list
 from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url
 from tools.infer.utility import draw_ocr, str2bool, check_gpu
 from ppstructure.utility import init_args, draw_structure_result
-from ppstructure.predict_system import OCRSystem, save_structure_res
+from ppstructure.predict_system import StructureSystem, save_structure_res

 __all__ = [
    'PaddleOCR', 'PPStructure', 'draw_ocr', 'draw_structure_result',
@@ -42,7 +47,7 @@ __all__ = [
 ]

 SUPPORT_DET_MODEL = ['DB']
-VERSION = '2.4'
+VERSION = '2.4.0.4'
 SUPPORT_REC_MODEL = ['CRNN']
 BASE_DIR = os.path.expanduser("~/.paddleocr/")

@@ -308,20 +313,18 @@ class PaddleOCR(predict_system.TextSystem):
                                            det_lang)
        params.det_model_dir, det_url = confirm_model_dir_url(
            params.det_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang),
+            os.path.join(BASE_DIR, 'whl', 'det', det_lang),
            det_model_config['url'])
        rec_model_config = get_model_config('OCR', params.ocr_version, 'rec',
                                            lang)
        params.rec_model_dir, rec_url = confirm_model_dir_url(
            params.rec_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang),
-            rec_model_config['url'])
+            os.path.join(BASE_DIR, 'whl', 'rec', lang), rec_model_config['url'])
        cls_model_config = get_model_config('OCR', params.ocr_version, 'cls',
                                            'ch')
        params.cls_model_dir, cls_url = confirm_model_dir_url(
            params.cls_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'cls'),
-            cls_model_config['url'])
+            os.path.join(BASE_DIR, 'whl', 'cls'), cls_model_config['url'])
        # download model
        maybe_download(params.det_model_dir, det_url)
        maybe_download(params.rec_model_dir, rec_url)
@@ -338,7 +341,7 @@ class PaddleOCR(predict_system.TextSystem):
            params.rec_char_dict_path = str(
                Path(__file__).parent / rec_model_config['dict_path'])

-        print(params)
+        logger.debug(params)
        # init det_model and rec_model
        super().__init__(params)

@@ -395,7 +398,7 @@ class PaddleOCR(predict_system.TextSystem):
            return rec_res


-class PPStructure(OCRSystem):
+class PPStructure(StructureSystem):
    def __init__(self, **kwargs):
        params = parse_args(mMain=False)
        params.__dict__.update(**kwargs)
@@ -412,20 +415,18 @@ class PPStructure(OCRSystem):
                                            det_lang)
        params.det_model_dir, det_url = confirm_model_dir_url(
            params.det_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'det', det_lang),
+            os.path.join(BASE_DIR, 'whl', 'det', det_lang),
            det_model_config['url'])
        rec_model_config = get_model_config('OCR', params.ocr_version, 'rec',
                                            lang)
        params.rec_model_dir, rec_url = confirm_model_dir_url(
            params.rec_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'rec', lang),
-            rec_model_config['url'])
+            os.path.join(BASE_DIR, 'whl', 'rec', lang), rec_model_config['url'])
        table_model_config = get_model_config(
            'STRUCTURE', params.structure_version, 'table', 'en')
        params.table_model_dir, table_url = confirm_model_dir_url(
            params.table_model_dir,
-            os.path.join(BASE_DIR, VERSION, 'ocr', 'table'),
-            table_model_config['url'])
+            os.path.join(BASE_DIR, 'whl', 'table'), table_model_config['url'])
        # download model
        maybe_download(params.det_model_dir, det_url)
        maybe_download(params.rec_model_dir, rec_url)
@@ -438,7 +439,7 @@ class PPStructure(OCRSystem):
            params.table_char_dict_path = str(
                Path(__file__).parent / table_model_config['dict_path'])

-        print(params)
+        logger.debug(params)
        super().__init__(params)

    def __call__(self, img):

--- a/ppocr/losses/det_pse_loss.py
+++ b/ppocr/losses/det_pse_loss.py
@@ -121,9 +121,9 @@ class PSELoss(nn.Layer):

        if neg_num == 0:
            selected_mask = training_mask
-            selected_mask = selected_mask.view(
-                1, selected_mask.shape[0],
-                selected_mask.shape[1]).astype('float32')
+            selected_mask = selected_mask.reshape(
+                [1, selected_mask.shape[0], selected_mask.shape[1]]).astype(
+                    'float32')
            return selected_mask

        neg_score = paddle.masked_select(score, gt_text <= 0.5)

--- a/ppocr/losses/kie_sdmgr_loss.py
+++ b/ppocr/losses/kie_sdmgr_loss.py
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/losses/sdmgr_loss.py
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

--- a/ppocr/metrics/kie_metric.py
+++ b/ppocr/metrics/kie_metric.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# The code is adapted from: https://github.com/open-mmlab/mmocr/blob/main/mmocr/core/evaluation/kie_metric.py

 from __future__ import absolute_import
 from __future__ import division

--- a/ppocr/modeling/heads/kie_sdmgr_head.py
+++ b/ppocr/modeling/heads/kie_sdmgr_head.py
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py

 from __future__ import absolute_import
 from __future__ import division

--- a/ppocr/modeling/heads/rec_sar_head.py
+++ b/ppocr/modeling/heads/rec_sar_head.py
@@ -216,7 +216,7 @@ class ParallelSARDecoder(BaseDecoder):
        self.pred_dropout = nn.Dropout(pred_dropout)
        pred_num_classes = self.num_classes - 1
        if pred_concat:
-            fc_in_channel = decoder_rnn_out_size + d_model + d_enc
+            fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size
        else:
            fc_in_channel = d_model
        self.prediction = nn.Linear(fc_in_channel, pred_num_classes)

--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -54,22 +54,24 @@ class BaseRecLabelDecode(object):
        ignored_tokens = self.get_ignored_tokens()
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
-            char_list = []
-            conf_list = []
-            for idx in range(len(text_index[batch_idx])):
-                if text_index[batch_idx][idx] in ignored_tokens:
-                    continue
+            selection = np.ones(len(text_index[batch_idx]), dtype=bool)
            if is_remove_duplicate:
-                    # only for predict
-                    if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
-                            batch_idx][idx]:
-                        continue
-                char_list.append(self.character[int(text_index[batch_idx][
-                    idx])])
+                selection[1:] = text_index[batch_idx][1:] != text_index[
+                    batch_idx][:-1]
+            for ignored_token in ignored_tokens:
+                selection &= text_index[batch_idx] != ignored_token
+
+            char_list = [
+                self.character[text_id]
+                for text_id in text_index[batch_idx][selection]
+            ]
            if text_prob is not None:
-                    conf_list.append(text_prob[batch_idx][idx])
+                conf_list = text_prob[batch_idx][selection]
            else:
-                    conf_list.append(1)
+                conf_list = [1] * len(selection)
+            if len(conf_list) == 0:
+                conf_list = [0]
+
            text = ''.join(char_list)
            result_list.append((text, np.mean(conf_list)))
        return result_list

--- a/ppocr/utils/logging.py
+++ b/ppocr/utils/logging.py
@@ -26,7 +26,7 @@ logger_initialized = {}


 @functools.lru_cache()
-def get_logger(name='root', log_file=None, log_level=logging.DEBUG):
+def get_logger(name='ppocr', log_file=None, log_level=logging.DEBUG):
    """Initialize and get a logger by name.
    If the logger has not been initialized, this method will initialize the
    logger by adding one or two handlers, otherwise the initialized logger will
@@ -67,4 +67,5 @@ def get_logger(name='root', log_file=None, log_level=logging.DEBUG):
    else:
        logger.setLevel(logging.ERROR)
    logger_initialized[name] = True
+    logger.propagate = False
    return logger
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@@ -22,6 +22,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))

 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
+import json
 import numpy as np
 import time
 import logging
@@ -35,7 +36,7 @@ from ppstructure.utility import parse_args, draw_structure_result
 logger = get_logger()


-class OCRSystem(object):
+class StructureSystem(object):
    def __init__(self, args):
        self.mode = args.mode
        if self.mode == 'structure':
@@ -66,8 +67,7 @@ class OCRSystem(object):
            self.use_angle_cls = args.use_angle_cls
            self.drop_score = args.drop_score
        elif self.mode == 'vqa':
-            from ppstructure.vqa.infer_ser_e2e import SerPredictor, draw_ser_results
-            self.vqa_engine = SerPredictor(args)
+            raise NotImplementedError

    def __call__(self, img):
        if self.mode == 'structure':
@@ -82,24 +82,24 @@ class OCRSystem(object):
                    res = self.table_system(roi_img)
                else:
                    filter_boxes, filter_rec_res = self.text_system(roi_img)
-                    filter_boxes = [x + [x1, y1] for x in filter_boxes]
-                    filter_boxes = [
-                        x.reshape(-1).tolist() for x in filter_boxes
-                    ]
                    # remove style char
                    style_token = [
                        '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
                        '</b>', '<sub>', '</sup>', '<overline>', '</overline>',
                        '<underline>', '</underline>', '<i>', '</i>'
                    ]
-                    filter_rec_res_tmp = []
-                    for rec_res in filter_rec_res:
+                    res = []
+                    for box, rec_res in zip(filter_boxes, filter_rec_res):
                        rec_str, rec_conf = rec_res
                        for token in style_token:
                            if token in rec_str:
                                rec_str = rec_str.replace(token, '')
-                        filter_rec_res_tmp.append((rec_str, rec_conf))
-                    res = (filter_boxes, filter_rec_res_tmp)
+                        box += [x1, y1]
+                        res.append({
+                            'text': rec_str,
+                            'confidence': float(rec_conf),
+                            'text_region': box.tolist()
+                        })
                res_list.append({
                    'type': region.type,
                    'bbox': [x1, y1, x2, y2],
@@ -107,7 +107,7 @@ class OCRSystem(object):
                    'res': res
                })
        elif self.mode == 'vqa':
-            res_list, _ = self.vqa_engine(img)
+            raise NotImplementedError
        return res_list


@@ -123,15 +123,14 @@ def save_structure_res(res, save_folder, img_name):
                excel_path = os.path.join(excel_save_folder,
                                          '{}.xlsx'.format(region['bbox']))
                to_excel(region['res'], excel_path)
-            if region['type'] == 'Figure':
+            elif region['type'] == 'Figure':
                roi_img = region['img']
                img_path = os.path.join(excel_save_folder,
                                        '{}.jpg'.format(region['bbox']))
                cv2.imwrite(img_path, roi_img)
            else:
-                for box, rec_res in zip(region['res'][0], region['res'][1]):
-                    f.write('{}\t{}\n'.format(
-                        np.array(box).reshape(-1).tolist(), rec_res))
+                for text_result in region['res']:
+                    f.write('{}\n'.format(json.dumps(text_result)))


 def main(args):
@@ -139,7 +138,7 @@ def main(args):
    image_file_list = image_file_list
    image_file_list = image_file_list[args.process_id::args.total_process_num]

-    structure_sys = OCRSystem(args)
+    structure_sys = StructureSystem(args)
    img_num = len(image_file_list)
    save_folder = os.path.join(args.output, structure_sys.mode)
    os.makedirs(save_folder, exist_ok=True)
@@ -162,8 +161,9 @@ def main(args):
            draw_img = draw_structure_result(img, res, args.vis_font_path)
            img_save_path = os.path.join(save_folder, img_name, 'show.jpg')
        elif structure_sys.mode == 'vqa':
-            draw_img = draw_ser_results(img, res, args.vis_font_path)
-            img_save_path = os.path.join(save_folder, img_name + '.jpg')
+            raise NotImplementedError
+            # draw_img = draw_ser_results(img, res, args.vis_font_path)
+            # img_save_path = os.path.join(save_folder, img_name + '.jpg')
        cv2.imwrite(img_save_path, draw_img)
        logger.info('result save to {}'.format(img_save_path))
        elapse = time.time() - starttime

--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -40,12 +40,6 @@ def init_args():
        type=ast.literal_eval,
        default=None,
        help='label map according to ppstructure/layout/README_ch.md')
-    # params for ser
-    parser.add_argument("--model_name_or_path", type=str)
-    parser.add_argument("--max_seq_length", type=int, default=512)
-    parser.add_argument(
-        "--label_map_path", type=str, default='./vqa/labels/labels_ser.txt')
-
    parser.add_argument(
        "--mode",
        type=str,
@@ -67,10 +61,10 @@ def draw_structure_result(image, result, font_path):
        if region['type'] == 'Table':
            pass
        else:
-            for box, rec_res in zip(region['res'][0], region['res'][1]):
-                boxes.append(np.array(box).reshape(-1, 2))
-                txts.append(rec_res[0])
-                scores.append(rec_res[1])
+            for text_result in region['res']:
+                boxes.append(np.array(text_result['text_region']))
+                txts.append(text_result['text'])
+                scores.append(text_result['confidence'])
    im_show = draw_ocr_box_txt(
        image, boxes, txts, scores, font_path=font_path, drop_score=0)
    return im_show