Merge branch 'dygraph' of https://github.com/PaddlePaddle/PaddleOCR into dev/add_thread_pred

76320bf0 · littletomatodonkey · e19bedf5 · 824ceca6 · 76320bf0 · 76320bf0
Commit 76320bf0 authored Apr 09, 2021 by littletomatodonkey
7 changed files
--- a/ppocr/utils/e2e_utils/visual.py
+++ b/ppocr/utils/e2e_utils/visual.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import cv2
+import time
+
+
+def resize_image(im, max_side_len=512):
+    """
+    resize image to a size multiple of max_stride which is required by the network
+    :param im: the resized image
+    :param max_side_len: limit of max image size to avoid out of memory in gpu
+    :return: the resized image and the resize ratio
+    """
+    h, w, _ = im.shape
+
+    resize_w = w
+    resize_h = h
+
+    if resize_h > resize_w:
+        ratio = float(max_side_len) / resize_h
+    else:
+        ratio = float(max_side_len) / resize_w
+
+    resize_h = int(resize_h * ratio)
+    resize_w = int(resize_w * ratio)
+
+    max_stride = 128
+    resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
+    resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
+    im = cv2.resize(im, (int(resize_w), int(resize_h)))
+    ratio_h = resize_h / float(h)
+    ratio_w = resize_w / float(w)
+
+    return im, (ratio_h, ratio_w)
+
+
+def resize_image_min(im, max_side_len=512):
+    """
+    """
+    h, w, _ = im.shape
+
+    resize_w = w
+    resize_h = h
+
+    if resize_h < resize_w:
+        ratio = float(max_side_len) / resize_h
+    else:
+        ratio = float(max_side_len) / resize_w
+
+    resize_h = int(resize_h * ratio)
+    resize_w = int(resize_w * ratio)
+
+    max_stride = 128
+    resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
+    resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
+    im = cv2.resize(im, (int(resize_w), int(resize_h)))
+    ratio_h = resize_h / float(h)
+    ratio_w = resize_w / float(w)
+    return im, (ratio_h, ratio_w)
+
+
+def resize_image_for_totaltext(im, max_side_len=512):
+    """
+    """
+    h, w, _ = im.shape
+
+    resize_w = w
+    resize_h = h
+    ratio = 1.25
+    if h * ratio > max_side_len:
+        ratio = float(max_side_len) / resize_h
+
+    resize_h = int(resize_h * ratio)
+    resize_w = int(resize_w * ratio)
+
+    max_stride = 128
+    resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
+    resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
+    im = cv2.resize(im, (int(resize_w), int(resize_h)))
+    ratio_h = resize_h / float(h)
+    ratio_w = resize_w / float(w)
+    return im, (ratio_h, ratio_w)
+
+
+def point_pair2poly(point_pair_list):
+    """
+    Transfer vertical point_pairs into poly point in clockwise.
+    """
+    pair_length_list = []
+    for point_pair in point_pair_list:
+        pair_length = np.linalg.norm(point_pair[0] - point_pair[1])
+        pair_length_list.append(pair_length)
+    pair_length_list = np.array(pair_length_list)
+    pair_info = (pair_length_list.max(), pair_length_list.min(),
+                 pair_length_list.mean())
+
+    point_num = len(point_pair_list) * 2
+    point_list = [0] * point_num
+    for idx, point_pair in enumerate(point_pair_list):
+        point_list[idx] = point_pair[0]
+        point_list[point_num - 1 - idx] = point_pair[1]
+    return np.array(point_list).reshape(-1, 2), pair_info
+
+
+def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.):
+    """
+    Generate shrink_quad_along_width.
+    """
+    ratio_pair = np.array(
+        [[begin_width_ratio], [end_width_ratio]], dtype=np.float32)
+    p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair
+    p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair
+    return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]])
+
+
+def expand_poly_along_width(poly, shrink_ratio_of_width=0.3):
+    """
+    expand poly along width.
+    """
+    point_num = poly.shape[0]
+    left_quad = np.array(
+        [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32)
+    left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \
+                 (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6)
+    left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0)
+    right_quad = np.array(
+        [
+            poly[point_num // 2 - 2], poly[point_num // 2 - 1],
+            poly[point_num // 2], poly[point_num // 2 + 1]
+        ],
+        dtype=np.float32)
+    right_ratio = 1.0 + \
+                  shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \
+                  (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6)
+    right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio)
+    poly[0] = left_quad_expand[0]
+    poly[-1] = left_quad_expand[-1]
+    poly[point_num // 2 - 1] = right_quad_expand[1]
+    poly[point_num // 2] = right_quad_expand[2]
+    return poly
+
+
+def norm2(x, axis=None):
+    if axis:
+        return np.sqrt(np.sum(x**2, axis=axis))
+    return np.sqrt(np.sum(x**2))
+
+
+def cos(p1, p2):
+    return (p1 * p2).sum() / (norm2(p1) * norm2(p2))
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ opencv-python==4.2.0.32
 tqdm
 numpy
 visualdl
-python-Levenshtein
\ No newline at end of file
+python-Levenshtein
+opencv-contrib-python
\ No newline at end of file
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ setup(
    package_dir={'paddleocr': ''},
    include_package_data=True,
    entry_points={"console_scripts": ["paddleocr= paddleocr.paddleocr:main"]},
-    version='2.0.3',
+    version='2.0.4',
    install_requires=requirements,
    license='Apache License 2.0',
    description='Awesome OCR toolkits based on PaddlePaddle （8.6M ultra-lightweight pre-trained model, support training and deployment among server, mobile, embeded and IoT devices',

--- a/tools/infer/predict_e2e.py
+++ b/tools/infer/predict_e2e.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
+
+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+
+import cv2
+import numpy as np
+import time
+import sys
+
+import tools.infer.utility as utility
+from ppocr.utils.logging import get_logger
+from ppocr.utils.utility import get_image_file_list, check_and_read_gif
+from ppocr.data import create_operators, transform
+from ppocr.postprocess import build_post_process
+
+logger = get_logger()
+
+
+class TextE2E(object):
+    def __init__(self, args):
+        self.args = args
+        self.e2e_algorithm = args.e2e_algorithm
+        pre_process_list = [{
+            'E2EResizeForTest': {}
+        }, {
+            'NormalizeImage': {
+                'std': [0.229, 0.224, 0.225],
+                'mean': [0.485, 0.456, 0.406],
+                'scale': '1./255.',
+                'order': 'hwc'
+            }
+        }, {
+            'ToCHWImage': None
+        }, {
+            'KeepKeys': {
+                'keep_keys': ['image', 'shape']
+            }
+        }]
+        postprocess_params = {}
+        if self.e2e_algorithm == "PGNet":
+            pre_process_list[0] = {
+                'E2EResizeForTest': {
+                    'max_side_len': args.e2e_limit_side_len,
+                    'valid_set': 'totaltext'
+                }
+            }
+            postprocess_params['name'] = 'PGPostProcess'
+            postprocess_params["score_thresh"] = args.e2e_pgnet_score_thresh
+            postprocess_params["character_dict_path"] = args.e2e_char_dict_path
+            postprocess_params["valid_set"] = args.e2e_pgnet_valid_set
+            self.e2e_pgnet_polygon = args.e2e_pgnet_polygon
+        else:
+            logger.info("unknown e2e_algorithm:{}".format(self.e2e_algorithm))
+            sys.exit(0)
+
+        self.preprocess_op = create_operators(pre_process_list)
+        self.postprocess_op = build_post_process(postprocess_params)
+        self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
+            args, 'e2e', logger)  # paddle.jit.load(args.det_model_dir)
+        # self.predictor.eval()
+
+    def clip_det_res(self, points, img_height, img_width):
+        for pno in range(points.shape[0]):
+            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
+            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
+        return points
+
+    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
+        img_height, img_width = image_shape[0:2]
+        dt_boxes_new = []
+        for box in dt_boxes:
+            box = self.clip_det_res(box, img_height, img_width)
+            dt_boxes_new.append(box)
+        dt_boxes = np.array(dt_boxes_new)
+        return dt_boxes
+
+    def __call__(self, img):
+
+        ori_im = img.copy()
+        data = {'image': img}
+        data = transform(data, self.preprocess_op)
+        img, shape_list = data
+        if img is None:
+            return None, 0
+        img = np.expand_dims(img, axis=0)
+        shape_list = np.expand_dims(shape_list, axis=0)
+        img = img.copy()
+        starttime = time.time()
+
+        self.input_tensor.copy_from_cpu(img)
+        self.predictor.run()
+        outputs = []
+        for output_tensor in self.output_tensors:
+            output = output_tensor.copy_to_cpu()
+            outputs.append(output)
+
+        preds = {}
+        if self.e2e_algorithm == 'PGNet':
+            preds['f_border'] = outputs[0]
+            preds['f_char'] = outputs[1]
+            preds['f_direction'] = outputs[2]
+            preds['f_score'] = outputs[3]
+        else:
+            raise NotImplementedError
+        post_result = self.postprocess_op(preds, shape_list)
+        points, strs = post_result['points'], post_result['strs']
+        dt_boxes = self.filter_tag_det_res_only_clip(points, ori_im.shape)
+        elapse = time.time() - starttime
+        return dt_boxes, strs, elapse
+
+
+if __name__ == "__main__":
+    args = utility.parse_args()
+    image_file_list = get_image_file_list(args.image_dir)
+    text_detector = TextE2E(args)
+    count = 0
+    total_time = 0
+    draw_img_save = "./inference_results"
+    if not os.path.exists(draw_img_save):
+        os.makedirs(draw_img_save)
+    for image_file in image_file_list:
+        img, flag = check_and_read_gif(image_file)
+        if not flag:
+            img = cv2.imread(image_file)
+        if img is None:
+            logger.info("error in loading image:{}".format(image_file))
+            continue
+        points, strs, elapse = text_detector(img)
+        if count > 0:
+            total_time += elapse
+        count += 1
+        logger.info("Predict time of {}: {}".format(image_file, elapse))
+        src_im = utility.draw_e2e_res(points, strs, image_file)
+        img_name_pure = os.path.split(image_file)[-1]
+        img_path = os.path.join(draw_img_save,
+                                "e2e_res_{}".format(img_name_pure))
+        cv2.imwrite(img_path, src_im)
+        logger.info("The visualized image saved in {}".format(img_path))
+    if count > 1:
+        logger.info("Avg Time: {}".format(total_time / (count - 1)))
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -74,6 +74,19 @@ def parse_args():
        "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf")
    parser.add_argument("--drop_score", type=float, default=0.5)

+    # params for e2e
+    parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
+    parser.add_argument("--e2e_model_dir", type=str)
+    parser.add_argument("--e2e_limit_side_len", type=float, default=768)
+    parser.add_argument("--e2e_limit_type", type=str, default='max')
+
+    # PGNet parmas
+    parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
+    parser.add_argument(
+        "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt")
+    parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
+    parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True)
+
    # params for text classifier
    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
    parser.add_argument("--cls_model_dir", type=str)
@@ -96,8 +109,10 @@ def create_predictor(args, mode, logger):
        model_dir = args.det_model_dir
    elif mode == 'cls':
        model_dir = args.cls_model_dir
-    else:
+    elif mode == 'rec':
        model_dir = args.rec_model_dir
+    else:
+        model_dir = args.e2e_model_dir

    if model_dir is None:
        logger.info("not find {} model file path {}".format(mode, model_dir))
@@ -151,6 +166,22 @@ def create_predictor(args, mode, logger):
    return predictor, input_tensor, output_tensors


+def draw_e2e_res(dt_boxes, strs, img_path):
+    src_im = cv2.imread(img_path)
+    for box, str in zip(dt_boxes, strs):
+        box = box.astype(np.int32).reshape((-1, 1, 2))
+        cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
+        cv2.putText(
+            src_im,
+            str,
+            org=(int(box[0, 0, 0]), int(box[0, 0, 1])),
+            fontFace=cv2.FONT_HERSHEY_COMPLEX,
+            fontScale=0.7,
+            color=(0, 255, 0),
+            thickness=1)
+    return src_im
+
+
 def draw_text_det_res(dt_boxes, img_path):
    src_im = cv2.imread(img_path)
    for box in dt_boxes:

--- a/tools/infer_e2e.py
+++ b/tools/infer_e2e.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+import os
+import sys
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
+
+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+
+import cv2
+import json
+import paddle
+
+from ppocr.data import create_operators, transform
+from ppocr.modeling.architectures import build_model
+from ppocr.postprocess import build_post_process
+from ppocr.utils.save_load import init_model
+from ppocr.utils.utility import get_image_file_list
+import tools.program as program
+
+
+def draw_e2e_res(dt_boxes, strs, config, img, img_name):
+    if len(dt_boxes) > 0:
+        src_im = img
+        for box, str in zip(dt_boxes, strs):
+            box = box.astype(np.int32).reshape((-1, 1, 2))
+            cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
+            cv2.putText(
+                src_im,
+                str,
+                org=(int(box[0, 0, 0]), int(box[0, 0, 1])),
+                fontFace=cv2.FONT_HERSHEY_COMPLEX,
+                fontScale=0.7,
+                color=(0, 255, 0),
+                thickness=1)
+        save_det_path = os.path.dirname(config['Global'][
+            'save_res_path']) + "/e2e_results/"
+        if not os.path.exists(save_det_path):
+            os.makedirs(save_det_path)
+        save_path = os.path.join(save_det_path, os.path.basename(img_name))
+        cv2.imwrite(save_path, src_im)
+        logger.info("The e2e Image saved in {}".format(save_path))
+
+
+def main():
+    global_config = config['Global']
+
+    # build model
+    model = build_model(config['Architecture'])
+
+    init_model(config, model, logger)
+
+    # build post process
+    post_process_class = build_post_process(config['PostProcess'],
+                                            global_config)
+
+    # create data ops
+    transforms = []
+    for op in config['Eval']['dataset']['transforms']:
+        op_name = list(op)[0]
+        if 'Label' in op_name:
+            continue
+        elif op_name == 'KeepKeys':
+            op[op_name]['keep_keys'] = ['image', 'shape']
+        transforms.append(op)
+
+    ops = create_operators(transforms, global_config)
+
+    save_res_path = config['Global']['save_res_path']
+    if not os.path.exists(os.path.dirname(save_res_path)):
+        os.makedirs(os.path.dirname(save_res_path))
+
+    model.eval()
+    with open(save_res_path, "wb") as fout:
+        for file in get_image_file_list(config['Global']['infer_img']):
+            logger.info("infer_img: {}".format(file))
+            with open(file, 'rb') as f:
+                img = f.read()
+                data = {'image': img}
+            batch = transform(data, ops)
+            images = np.expand_dims(batch[0], axis=0)
+            shape_list = np.expand_dims(batch[1], axis=0)
+            images = paddle.to_tensor(images)
+            preds = model(images)
+            post_result = post_process_class(preds, shape_list)
+            points, strs = post_result['points'], post_result['strs']
+            # write resule
+            dt_boxes_json = []
+            for poly, str in zip(points, strs):
+                tmp_json = {"transcription": str}
+                tmp_json['points'] = poly.tolist()
+                dt_boxes_json.append(tmp_json)
+            otstr = file + "\t" + json.dumps(dt_boxes_json) + "\n"
+            fout.write(otstr.encode())
+            src_img = cv2.imread(file)
+            draw_e2e_res(points, strs, config, src_img, file)
+    logger.info("success!")
+
+
+if __name__ == '__main__':
+    config, device, logger, vdl_writer = program.preprocess()
+    main()
--- a/tools/program.py
+++ b/tools/program.py
@@ -375,7 +375,8 @@ def preprocess(is_train=False):

    alg = config['Architecture']['algorithm']
    assert alg in [
-        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', 'CLS'
+        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
+        'CLS', 'PGNet'
    ]

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'