Commit 4c6b03ad authored by Leif

Merge remote-tracking branch 'origin/release/2.5' into release2.5

parents edad6e7c 77331549
@@ -27,12 +27,12 @@ class CosineEmbeddingLoss(nn.Layer):
         self.epsilon = 1e-12

     def forward(self, x1, x2, target):
-        similarity = paddle.fluid.layers.reduce_sum(
+        similarity = paddle.sum(
             x1 * x2, dim=-1) / (paddle.norm(
                 x1, axis=-1) * paddle.norm(
                     x2, axis=-1) + self.epsilon)
         one_list = paddle.full_like(target, fill_value=1)
-        out = paddle.fluid.layers.reduce_mean(
+        out = paddle.mean(
             paddle.where(
                 paddle.equal(target, one_list), 1. - similarity,
                 paddle.maximum(
......
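For reference, a minimal sketch of the migrated loss under the paddle 2.x API. Note that `paddle.sum` takes `axis` rather than the legacy `dim` keyword retained in the context line above, and the negative-pair branch is truncated in the hunk, so the `margin` handling below follows the standard cosine-embedding formulation rather than the exact file contents:

```python
import paddle
from paddle import nn

class CosineEmbeddingLoss(nn.Layer):
    def __init__(self, margin=0.):
        super().__init__()
        self.margin = margin
        self.epsilon = 1e-12

    def forward(self, x1, x2, target):
        # Cosine similarity along the last axis; paddle.sum uses `axis`,
        # not the legacy `dim` keyword shown in the hunk above.
        similarity = paddle.sum(
            x1 * x2, axis=-1) / (paddle.norm(
                x1, axis=-1) * paddle.norm(x2, axis=-1) + self.epsilon)
        one_list = paddle.full_like(target, fill_value=1)
        # Positive pairs are pulled together (1 - cos); negative pairs are
        # pushed below the margin (standard formulation, elided in the diff).
        return paddle.mean(
            paddle.where(
                paddle.equal(target, one_list), 1. - similarity,
                paddle.maximum(
                    paddle.zeros_like(similarity), similarity - self.margin)))
```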
@@ -19,7 +19,6 @@ from __future__ import print_function

 import paddle
 from paddle import nn
 from paddle.nn import functional as F
-from paddle import fluid

 class TableAttentionLoss(nn.Layer):
     def __init__(self, structure_weight, loc_weight, use_giou=False, giou_weight=1.0, **kwargs):
@@ -36,13 +35,13 @@ class TableAttentionLoss(nn.Layer):
         :param bbox:[[x1,y1,x2,y2], [x1,y1,x2,y2],,,]
         :return: loss
         '''
-        ix1 = fluid.layers.elementwise_max(preds[:, 0], bbox[:, 0])
-        iy1 = fluid.layers.elementwise_max(preds[:, 1], bbox[:, 1])
-        ix2 = fluid.layers.elementwise_min(preds[:, 2], bbox[:, 2])
-        iy2 = fluid.layers.elementwise_min(preds[:, 3], bbox[:, 3])
+        ix1 = paddle.maximum(preds[:, 0], bbox[:, 0])
+        iy1 = paddle.maximum(preds[:, 1], bbox[:, 1])
+        ix2 = paddle.minimum(preds[:, 2], bbox[:, 2])
+        iy2 = paddle.minimum(preds[:, 3], bbox[:, 3])

-        iw = fluid.layers.clip(ix2 - ix1 + 1e-3, 0., 1e10)
-        ih = fluid.layers.clip(iy2 - iy1 + 1e-3, 0., 1e10)
+        iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10)
+        ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10)

         # overlap
         inters = iw * ih
@@ -55,12 +54,12 @@ class TableAttentionLoss(nn.Layer):
         # ious
         ious = inters / uni

-        ex1 = fluid.layers.elementwise_min(preds[:, 0], bbox[:, 0])
-        ey1 = fluid.layers.elementwise_min(preds[:, 1], bbox[:, 1])
-        ex2 = fluid.layers.elementwise_max(preds[:, 2], bbox[:, 2])
-        ey2 = fluid.layers.elementwise_max(preds[:, 3], bbox[:, 3])
+        ex1 = paddle.minimum(preds[:, 0], bbox[:, 0])
+        ey1 = paddle.minimum(preds[:, 1], bbox[:, 1])
+        ex2 = paddle.maximum(preds[:, 2], bbox[:, 2])
+        ey2 = paddle.maximum(preds[:, 3], bbox[:, 3])
-        ew = fluid.layers.clip(ex2 - ex1 + 1e-3, 0., 1e10)
-        eh = fluid.layers.clip(ey2 - ey1 + 1e-3, 0., 1e10)
+        ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10)
+        eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10)

         # enclose area
         enclose = ew * eh + eps
......
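These two hunks migrate the GIoU term operator by operator (`fluid.layers.elementwise_max` → `paddle.maximum`, `fluid.layers.clip` → `paddle.clip`). A runnable sketch of the whole computation under the new API; the union and the final reduction fall outside the shown hunks, so they are filled in here from the standard GIoU definition:

```python
import paddle

def giou_loss(preds, bbox, eps=1e-10):
    # Intersection rectangle of predicted and ground-truth boxes.
    ix1 = paddle.maximum(preds[:, 0], bbox[:, 0])
    iy1 = paddle.maximum(preds[:, 1], bbox[:, 1])
    ix2 = paddle.minimum(preds[:, 2], bbox[:, 2])
    iy2 = paddle.minimum(preds[:, 3], bbox[:, 3])
    iw = paddle.clip(ix2 - ix1 + 1e-3, 0., 1e10)
    ih = paddle.clip(iy2 - iy1 + 1e-3, 0., 1e10)
    inters = iw * ih  # overlap area

    # Union (this part is elided context in the diff above).
    area_p = (preds[:, 2] - preds[:, 0] + 1e-3) * (preds[:, 3] - preds[:, 1] + 1e-3)
    area_g = (bbox[:, 2] - bbox[:, 0] + 1e-3) * (bbox[:, 3] - bbox[:, 1] + 1e-3)
    uni = area_p + area_g - inters + eps
    ious = inters / uni

    # Smallest enclosing box.
    ex1 = paddle.minimum(preds[:, 0], bbox[:, 0])
    ey1 = paddle.minimum(preds[:, 1], bbox[:, 1])
    ex2 = paddle.maximum(preds[:, 2], bbox[:, 2])
    ey2 = paddle.maximum(preds[:, 3], bbox[:, 3])
    ew = paddle.clip(ex2 - ex1 + 1e-3, 0., 1e10)
    eh = paddle.clip(ey2 - ey1 + 1e-3, 0., 1e10)
    enclose = ew * eh + eps

    giou = ious - (enclose - uni) / enclose
    return paddle.mean(1.0 - giou)
```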
@@ -175,12 +175,7 @@ class Kie_backbone(nn.Layer):
             img, relations, texts, gt_bboxes, tag, img_size)
         x = self.img_feat(img)
         boxes, rois_num = self.bbox2roi(gt_bboxes)
-        feats = paddle.fluid.layers.roi_align(
-            x,
-            boxes,
-            spatial_scale=1.0,
-            pooled_height=7,
-            pooled_width=7,
-            rois_num=rois_num)
+        feats = paddle.vision.ops.roi_align(
+            x, boxes, spatial_scale=1.0, output_size=7, boxes_num=rois_num)
         feats = self.maxpool(feats).squeeze(-1).squeeze(-1)
         return [relations, texts, feats]
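`paddle.vision.ops.roi_align` folds the old `pooled_height`/`pooled_width` pair into a single `output_size` and renames `rois_num` to `boxes_num`. A minimal usage sketch with illustrative shapes:

```python
import paddle
from paddle.vision.ops import roi_align

# Feature map: N=1, C=256, H=32, W=32 (illustrative shapes).
x = paddle.rand([1, 256, 32, 32])
# Two boxes in (x1, y1, x2, y2) on the feature-map scale.
boxes = paddle.to_tensor([[4., 4., 20., 20.], [8., 8., 28., 24.]])
# One entry per batch image: how many boxes belong to it.
boxes_num = paddle.to_tensor([2], dtype='int32')

# output_size=7 replaces the old pooled_height=7 / pooled_width=7 pair.
feats = roi_align(x, boxes, boxes_num=boxes_num, output_size=7, spatial_scale=1.0)
print(feats.shape)  # [2, 256, 7, 7]
```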
@@ -18,7 +18,6 @@ from __future__ import print_function

 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import paddle
 import numpy as np
......
@@ -20,13 +20,11 @@ import math

 import paddle
 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import numpy as np
 from .self_attention import WrapEncoderForFeature
 from .self_attention import WrapEncoder
 from paddle.static import Program
 from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN
-import paddle.fluid.framework as framework
 from collections import OrderedDict

 gradient_clip = 10
......
@@ -22,7 +22,6 @@ import paddle

 from paddle import ParamAttr, nn
 from paddle import nn, ParamAttr
 from paddle.nn import functional as F
-import paddle.fluid as fluid
 import numpy as np

 gradient_clip = 10
@@ -288,10 +287,10 @@ class PrePostProcessLayer(nn.Layer):
                         "layer_norm_%d" % len(self.sublayers()),
                         paddle.nn.LayerNorm(
                             normalized_shape=d_model,
-                            weight_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(1.)),
-                            bias_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(0.)))))
+                            weight_attr=paddle.ParamAttr(
+                                initializer=paddle.nn.initializer.Constant(1.)),
+                            bias_attr=paddle.ParamAttr(
+                                initializer=paddle.nn.initializer.Constant(0.)))))
             elif cmd == "d":  # add dropout
                 self.functors.append(lambda x: F.dropout(
                     x, p=dropout_rate, mode="downscale_in_infer")
@@ -324,7 +323,7 @@ class PrepareEncoder(nn.Layer):

     def forward(self, src_word, src_pos):
         src_word_emb = src_word
-        src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
+        src_word_emb = paddle.cast(src_word_emb, 'float32')
         src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
         src_pos = paddle.squeeze(src_pos, axis=-1)
         src_pos_enc = self.emb(src_pos)
@@ -367,7 +366,7 @@ class PrepareDecoder(nn.Layer):
         self.dropout_rate = dropout_rate

     def forward(self, src_word, src_pos):
-        src_word = fluid.layers.cast(src_word, 'int64')
+        src_word = paddle.cast(src_word, 'int64')
         src_word = paddle.squeeze(src_word, axis=-1)
         src_word_emb = self.emb0(src_word)
         src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5)
......
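The self-attention hunks are a mechanical fluid-to-paddle-2.x rename: `fluid.ParamAttr` → `paddle.ParamAttr`, `fluid.initializer.Constant` → `paddle.nn.initializer.Constant`, `fluid.layers.cast` → `paddle.cast`. A self-contained sketch of the migrated `LayerNorm` construction:

```python
import paddle
from paddle import nn

# LayerNorm with explicitly initialized scale and bias, as in the migrated code.
d_model = 512
layer_norm = nn.LayerNorm(
    normalized_shape=d_model,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Constant(1.)),  # gamma = 1
    bias_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.Constant(0.)))  # beta = 0

x = paddle.rand([2, 10, d_model])
y = layer_norm(paddle.cast(x, 'float32'))  # paddle.cast replaces fluid.layers.cast
print(y.shape)  # [2, 10, 512]
```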
@@ -38,6 +38,7 @@ class DBPostProcess(object):
                  unclip_ratio=2.0,
                  use_dilation=False,
                  score_mode="fast",
+                 visual_output=False,
                  **kwargs):
         self.thresh = thresh
         self.box_thresh = box_thresh
@@ -51,6 +52,7 @@ class DBPostProcess(object):
         self.dilation_kernel = None if not use_dilation else np.array(
             [[1, 1], [1, 1]])
+        self.visual = visual_output

     def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
         '''
@@ -169,12 +171,19 @@ class DBPostProcess(object):
         cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
         return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

+    def visual_output(self, pred):
+        im = np.array(pred[0] * 255).astype(np.uint8)
+        cv2.imwrite("db_probability_map.png", im)
+        print("The probability map is visualized in db_probability_map.png")
+
     def __call__(self, outs_dict, shape_list):
         pred = outs_dict['maps']
         if isinstance(pred, paddle.Tensor):
             pred = pred.numpy()
         pred = pred[:, 0, :, :]
         segmentation = pred > self.thresh
+        if self.visual:
+            self.visual_output(pred)

         boxes_batch = []
         for batch_index in range(pred.shape[0]):
......
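The new `visual_output` switch (exposed on the command line as `--vis_seg_map` later in this commit) dumps the DB probability map for debugging. A standalone sketch of what the hook writes, using a random map in place of real model output:

```python
import numpy as np
import cv2

# Stand-in for outs_dict['maps'][:, 0, :, :] after .numpy(): one probability
# map per image, values in [0, 1].
pred = np.random.rand(1, 640, 640).astype(np.float32)

# Same scaling as DBPostProcess.visual_output: map [0, 1] to 8-bit grayscale.
im = np.array(pred[0] * 255).astype(np.uint8)
cv2.imwrite("db_probability_map.png", im)
```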
@@ -177,9 +177,9 @@ def save_model(model,
             model.backbone.model.save_pretrained(model_prefix)
         metric_prefix = os.path.join(model_prefix, 'metric')
     # save metric and config
-    with open(metric_prefix + '.states', 'wb') as f:
-        pickle.dump(kwargs, f, protocol=2)
     if is_best:
+        with open(metric_prefix + '.states', 'wb') as f:
+            pickle.dump(kwargs, f, protocol=2)
         logger.info('save best model is to {}'.format(model_prefix))
     else:
         logger.info("save model in {}".format(model_prefix))
@@ -19,6 +19,24 @@ SDMGR is a key information extraction algorithm that classifies each detected text region

 wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
 ```
+
+Dataset format:
+```
+./wildreceipt
+├── class_list.txt         # Text categories inside each box, e.g. amount, time, date.
+├── dict.txt               # Dictionary file for recognition: the list of characters contained in the dataset.
+├── wildreceipt_train.txt  # Training label file.
+├── wildreceipt_test.txt   # Evaluation label file.
+└── image_files/           # Image folder.
+```
+
+The format of each line in the label file is:
+```
+" image file name    image annotation information encoded by json.dumps"
+image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
+```
+
+**Note: if you want to train on your own dataset, it is recommended to prepare it in the format described above.**
+
 Run prediction:
 ```
......
@@ -18,6 +18,22 @@ This section provides a tutorial example on how to quickly use, train, and evaluate

 wget https://paddleocr.bj.bcebos.com/dygraph_v2.1/kie/wildreceipt.tar && tar xf wildreceipt.tar
 ```
+
+The dataset format is as follows:
+```
+./wildreceipt
+├── class_list.txt         # The text category inside each box, such as amount, time, date, etc.
+├── dict.txt               # The dictionary file for recognition, listing the characters contained in the dataset.
+├── wildreceipt_train.txt  # Training data label file.
+├── wildreceipt_test.txt   # Testing data label file.
+└── image_files/           # Image folder.
+```
+
+The format of each line in the label file is:
+```
+" The image file path    Image annotation information encoded by json.dumps"
+image_files/Image_16/11/d5de7f2a20751e50b84c747c17a24cd98bed3554.jpeg [{"label": 1, "transcription": "SAFEWAY", "points": [[550.0, 190.0], [937.0, 190.0], [937.0, 104.0], [550.0, 104.0]]}, {"label": 25, "transcription": "TM", "points": [[1048.0, 211.0], [1074.0, 211.0], [1074.0, 196.0], [1048.0, 196.0]]}, {"label": 25, "transcription": "ATOREMGRTOMMILAZZO", "points": [[535.0, 239.0], [833.0, 239.0], [833.0, 200.0], [535.0, 200.0]]}, {"label": 5, "transcription": "703-777-5833", "points": [[907.0, 256.0], [1081.0, 256.0], [1081.0, 223.0], [907.0, 223.0]]}......
+```
+
 Download the pretrained model and predict the result:
 ```shell
......
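Each line of the label file pairs an image path with a `json.dumps`-encoded list of boxes. A minimal parsing sketch, assuming the two fields are tab-separated as in other PaddleOCR label files:

```python
import json

# Minimal sketch for reading a wildreceipt label file. Assumes the image
# path and the json.dumps-encoded annotations are separated by a tab.
with open("wildreceipt/wildreceipt_train.txt", "r", encoding="utf-8") as f:
    for line in f:
        image_path, annotation = line.rstrip("\n").split("\t", 1)
        boxes = json.loads(annotation)
        for box in boxes:
            label = box["label"]          # category id from class_list.txt
            text = box["transcription"]   # text inside the box
            points = box["points"]        # four [x, y] corner points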
@@ -192,7 +192,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed

 Use the following command to complete the series prediction of `OCR engine + SER`, taking the pretrained SER model as an example:

 ```shell
-CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_42.jpg
+CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_42.jpg
 ```

 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.

@@ -203,7 +203,7 @@ First use the `tools/infer_vqa_token_ser.py` script to complete the prediction of

 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
+python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ```

 <a name="53"></a>
@@ -247,7 +247,7 @@ Finally, `precision`, `recall`, `hmean` and other indicators will be printed

 Use the following command to complete the series prediction of `OCR engine + SER + RE`, taking the pretrained SER and RE models as an example:
 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
+python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Architecture.Backbone.checkpoints=pretrain/re_LayoutXLM_xfun_zh/ Global.infer_img=doc/vqa/input/zh_val_21.jpg -c_ser configs/vqa/ser/layoutxlm.yml -o_ser Architecture.Backbone.checkpoints=pretrain/ser_LayoutXLM_xfun_zh/
 ```

 Finally, the prediction result visualization image and the prediction result text file will be saved in the directory configured by the `config.Global.save_res_path` field. The prediction result text file is named `infer_results.txt`.
......
@@ -198,7 +198,7 @@ CUDA_VISIBLE_DEVICES=0 python3 tools/infer_vqa_token_ser.py -c configs/vqa/ser/l

 ```shell
 export CUDA_VISIBLE_DEVICES=0
-python3 tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
+python3 ppstructure/vqa/tools/eval_with_label_end2end.py --gt_json_path XFUND/zh_val/xfun_normalize_val.json --pred_json_path output_res/infer_results.txt
 ```

 ### 5.3 RE
......
@@ -329,6 +329,7 @@ else
             set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
             if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
+                eval ${env}
                 cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} "
             elif [ ${#ips} -le 26 ];then  # train with multi-gpu
                 cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}"
......
@@ -76,7 +76,7 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
     else:
         infer_shape = [3, -1, -1]
         if arch_config["model_type"] == "rec":
-            infer_shape = [3, 32, -1]  # for rec model, H must be 32
+            infer_shape = [3, 48, -1]  # for rec model, H must be 48
     if "Transform" in arch_config and arch_config[
             "Transform"] is not None and arch_config["Transform"][
                 "name"] == "TPS":
......
@@ -24,6 +24,7 @@ import cv2
 import numpy as np
 import time
 import sys
+from scipy.spatial import distance as dist

 import tools.infer.utility as utility
 from ppocr.utils.logging import get_logger

@@ -154,9 +155,10 @@ class TextDetector(object):
         s = pts.sum(axis=1)
         rect[0] = pts[np.argmin(s)]
         rect[2] = pts[np.argmax(s)]
-        diff = np.diff(pts, axis=1)
-        rect[1] = pts[np.argmin(diff)]
-        rect[3] = pts[np.argmax(diff)]
+        tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
+        diff = np.diff(np.array(tmp), axis=1)
+        rect[1] = tmp[np.argmin(diff)]
+        rect[3] = tmp[np.argmax(diff)]
         return rect

     def clip_det_res(self, points, img_height, img_width):
......
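The `order_points` fix above matters when one point is an extreme for both `x + y` and `y - x`: the old code could then assign the same point to two corners. Deleting the already-chosen top-left and bottom-right points first guarantees four distinct corners. A standalone sketch of the fixed logic:

```python
import numpy as np

def order_points(pts):
    # Order 4 points as top-left, top-right, bottom-right, bottom-left.
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]  # top-left: smallest x + y
    rect[2] = pts[np.argmax(s)]  # bottom-right: largest x + y
    # Remove the two points already used so the remaining argmin/argmax
    # cannot select the same point twice (the bug the patch fixes).
    tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
    diff = np.diff(np.array(tmp), axis=1)  # y - x for the two leftovers
    rect[1] = tmp[np.argmin(diff)]  # top-right: smallest y - x
    rect[3] = tmp[np.argmax(diff)]  # bottom-left: largest y - x
    return rect

pts = np.array([[10., 10.], [100., 12.], [98., 60.], [8., 58.]], dtype="float32")
print(order_points(pts))  # corners in TL, TR, BR, BL order
```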
@@ -114,11 +114,14 @@ def sorted_boxes(dt_boxes):
     _boxes = list(sorted_boxes)

     for i in range(num_boxes - 1):
-        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
-                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
-            tmp = _boxes[i]
-            _boxes[i] = _boxes[i + 1]
-            _boxes[i + 1] = tmp
+        for j in range(i, 0, -1):
+            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
+                    (_boxes[j + 1][0][0] < _boxes[j][0][0]):
+                tmp = _boxes[j]
+                _boxes[j] = _boxes[j + 1]
+                _boxes[j + 1] = tmp
+            else:
+                break
     return _boxes
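The old single bubble pass could leave same-line boxes out of order; the nested loop turns it into an insertion sort that keeps shifting a box left while it belongs before its neighbour. A standalone sketch; note that `range(i, 0, -1)` as merged never reaches `j == 0`, so the first two boxes are never compared, while `range(i, -1, -1)` covers that case as well:

```python
import numpy as np

def sorted_boxes(dt_boxes):
    """Sort text boxes top-to-bottom, then left-to-right within a line."""
    num_boxes = dt_boxes.shape[0]
    # Primary sort by top-left y, then x.
    _boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
    for i in range(num_boxes - 1):
        # Boxes whose top-left y differs by < 10 px count as the same line
        # and are reordered by x; stop as soon as the box is in place.
        for j in range(i, -1, -1):
            if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
                    _boxes[j + 1][0][0] < _boxes[j][0][0]:
                _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
            else:
                break
    return _boxes

boxes = np.array([[[0, 0]] * 4, [[60, 2]] * 4, [[30, 1]] * 4], dtype=np.float32)
print([b[0].tolist() for b in sorted_boxes(boxes)])  # [[0,0], [30,1], [60,2]]
```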
@@ -135,7 +138,7 @@ def main(args):
     logger.info("In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
                 "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320'")
     # warm up 10 times
     if args.warmup:
         img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)

@@ -198,7 +201,12 @@ def main(args):
         text_sys.text_detector.autolog.report()
         text_sys.text_recognizer.autolog.report()

-    with open(os.path.join(draw_img_save_dir, "system_results.txt"), 'w', encoding='utf-8') as f:
+    if args.total_process_num > 1:
+        save_results_path = os.path.join(draw_img_save_dir, f"system_results_{args.process_id}.txt")
+    else:
+        save_results_path = os.path.join(draw_img_save_dir, "system_results.txt")
+    with open(save_results_path, 'w', encoding='utf-8') as f:
         f.writelines(save_results)
......
@@ -55,6 +55,7 @@ def init_args():
     parser.add_argument("--max_batch_size", type=int, default=10)
     parser.add_argument("--use_dilation", type=str2bool, default=False)
     parser.add_argument("--det_db_score_mode", type=str, default="fast")
+    parser.add_argument("--vis_seg_map", type=str2bool, default=False)

     # EAST params
     parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
     parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)

@@ -276,6 +277,7 @@ def create_predictor(args, mode, logger):
                 min_input_shape = {"x": [1, 3, imgH, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
                 opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
+                config.exp_disable_tensorrt_ops(["transpose2"])
             elif mode == "cls":
                 min_input_shape = {"x": [1, 3, 48, 10]}
                 max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}

@@ -587,7 +589,7 @@ def text_visual(texts,

 def base64_to_cv2(b64str):
     import base64
     data = base64.b64decode(b64str.encode('utf8'))
-    data = np.fromstring(data, np.uint8)
+    data = np.frombuffer(data, np.uint8)
     data = cv2.imdecode(data, cv2.IMREAD_COLOR)
     return data
......
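`np.fromstring` is deprecated for binary input; `np.frombuffer` returns a read-only view over the decoded bytes without the deprecation warning or an extra copy. A round-trip sketch of the fixed helper:

```python
import base64
import cv2
import numpy as np

def base64_to_cv2(b64str):
    data = base64.b64decode(b64str.encode('utf8'))
    # np.frombuffer replaces the deprecated np.fromstring for binary data.
    data = np.frombuffer(data, np.uint8)
    return cv2.imdecode(data, cv2.IMREAD_COLOR)

# Round trip: encode a dummy image to PNG, base64 it, and decode it back.
img = np.zeros((8, 8, 3), np.uint8)
ok, buf = cv2.imencode(".png", img)
b64 = base64.b64encode(buf.tobytes()).decode("utf8")
assert (base64_to_cv2(b64) == img).all()
```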