first commit

019b16be · chenxj · 3019db46 · 019b16be · 019b16be · 019b16be
Commit 019b16be authored Sep 25, 2023 by chenxj
20 changed files
--- a/ppocr/data/imaug/__pycache__/label_ops.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/label_ops.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/make_border_map.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/make_border_map.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/make_pse_gt.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/make_pse_gt.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/make_shrink_map.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/make_shrink_map.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/operators.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/operators.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/pg_process.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/pg_process.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/randaugment.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/randaugment.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/random_crop_data.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/random_crop_data.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/rec_img_aug.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/rec_img_aug.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/sast_process.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/sast_process.cpython-37.pyc
--- a/ppocr/data/imaug/__pycache__/ssl_img_aug.cpython-37.pyc
+++ b/ppocr/data/imaug/__pycache__/ssl_img_aug.cpython-37.pyc
--- a/ppocr/data/imaug/copy_paste.py
+++ b/ppocr/data/imaug/copy_paste.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import cv2
+import random
+import numpy as np
+from PIL import Image
+from shapely.geometry import Polygon
+
+from ppocr.data.imaug.iaa_augment import IaaAugment
+from ppocr.data.imaug.random_crop_data import is_poly_outside_rect
+from tools.infer.utility import get_rotate_crop_image
+
+
+class CopyPaste(object):
+    def __init__(self, objects_paste_ratio=0.2, limit_paste=True, **kwargs):
+        self.ext_data_num = 1
+        self.objects_paste_ratio = objects_paste_ratio
+        self.limit_paste = limit_paste
+        augmenter_args = [{'type': 'Resize', 'args': {'size': [0.5, 3]}}]
+        self.aug = IaaAugment(augmenter_args)
+
+    def __call__(self, data):
+        point_num = data['polys'].shape[1]
+        src_img = data['image']
+        src_polys = data['polys'].tolist()
+        src_texts = data['texts']
+        src_ignores = data['ignore_tags'].tolist()
+        ext_data = data['ext_data'][0]
+        ext_image = ext_data['image']
+        ext_polys = ext_data['polys']
+        ext_texts = ext_data['texts']
+        ext_ignores = ext_data['ignore_tags']
+
+        indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]]
+        select_num = max(
+            1, min(int(self.objects_paste_ratio * len(ext_polys)), 30))
+
+        random.shuffle(indexs)
+        select_idxs = indexs[:select_num]
+        select_polys = ext_polys[select_idxs]
+        select_ignores = ext_ignores[select_idxs]
+
+        src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB)
+        ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB)
+        src_img = Image.fromarray(src_img).convert('RGBA')
+        for idx, poly, tag in zip(select_idxs, select_polys, select_ignores):
+            box_img = get_rotate_crop_image(ext_image, poly)
+
+            src_img, box = self.paste_img(src_img, box_img, src_polys)
+            if box is not None:
+                box = box.tolist() 
+                for _ in range(len(box), point_num):
+                    box.append(box[-1])
+                src_polys.append(box)
+                src_texts.append(ext_texts[idx])
+                src_ignores.append(tag)
+        src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR)
+        h, w = src_img.shape[:2]
+        src_polys = np.array(src_polys)
+        src_polys[:, :, 0] = np.clip(src_polys[:, :, 0], 0, w)
+        src_polys[:, :, 1] = np.clip(src_polys[:, :, 1], 0, h)
+        data['image'] = src_img
+        data['polys'] = src_polys
+        data['texts'] = src_texts
+        data['ignore_tags'] = np.array(src_ignores)
+        return data
+
+    def paste_img(self, src_img, box_img, src_polys):
+        box_img_pil = Image.fromarray(box_img).convert('RGBA')
+        src_w, src_h = src_img.size
+        box_w, box_h = box_img_pil.size
+
+        angle = np.random.randint(0, 360)
+        box = np.array([[[0, 0], [box_w, 0], [box_w, box_h], [0, box_h]]])
+        box = rotate_bbox(box_img, box, angle)[0]
+        box_img_pil = box_img_pil.rotate(angle, expand=1)
+        box_w, box_h = box_img_pil.width, box_img_pil.height
+        if src_w - box_w < 0 or src_h - box_h < 0:
+            return src_img, None
+
+        paste_x, paste_y = self.select_coord(src_polys, box, src_w - box_w,
+                                             src_h - box_h)
+        if paste_x is None:
+            return src_img, None
+        box[:, 0] += paste_x
+        box[:, 1] += paste_y
+        r, g, b, A = box_img_pil.split()
+        src_img.paste(box_img_pil, (paste_x, paste_y), mask=A)
+
+        return src_img, box
+
+    def select_coord(self, src_polys, box, endx, endy):
+        if self.limit_paste:
+            xmin, ymin, xmax, ymax = box[:, 0].min(), box[:, 1].min(
+            ), box[:, 0].max(), box[:, 1].max()
+            for _ in range(50):
+                paste_x = random.randint(0, endx)
+                paste_y = random.randint(0, endy)
+                xmin1 = xmin + paste_x
+                xmax1 = xmax + paste_x
+                ymin1 = ymin + paste_y
+                ymax1 = ymax + paste_y
+
+                num_poly_in_rect = 0
+                for poly in src_polys:
+                    if not is_poly_outside_rect(poly, xmin1, ymin1,
+                                                xmax1 - xmin1, ymax1 - ymin1):
+                        num_poly_in_rect += 1
+                        break
+                if num_poly_in_rect == 0:
+                    return paste_x, paste_y
+            return None, None
+        else:
+            paste_x = random.randint(0, endx)
+            paste_y = random.randint(0, endy)
+            return paste_x, paste_y
+
+
+def get_union(pD, pG):
+    return Polygon(pD).union(Polygon(pG)).area
+
+
+def get_intersection_over_union(pD, pG):
+    return get_intersection(pD, pG) / get_union(pD, pG)
+
+
+def get_intersection(pD, pG):
+    return Polygon(pD).intersection(Polygon(pG)).area
+
+
+def rotate_bbox(img, text_polys, angle, scale=1):
+    """
+    from https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/augment.py
+    Args:
+        img: np.ndarray
+        text_polys: np.ndarray N*4*2
+        angle: int
+        scale: int
+
+    Returns:
+
+    """
+    w = img.shape[1]
+    h = img.shape[0]
+
+    rangle = np.deg2rad(angle)
+    nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w))
+    nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w))
+    rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, scale)
+    rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
+    rot_mat[0, 2] += rot_move[0]
+    rot_mat[1, 2] += rot_move[1]
+
+    # ---------------------- rotate box ----------------------
+    rot_text_polys = list()
+    for bbox in text_polys:
+        point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
+        point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
+        point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
+        point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
+        rot_text_polys.append([point1, point2, point3, point4])
+    return np.array(rot_text_polys, dtype=np.float32)
--- a/ppocr/data/imaug/iaa_augment.py
+++ b/ppocr/data/imaug/iaa_augment.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/iaa_augment.py
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import imgaug
+import imgaug.augmenters as iaa
+
+
+class AugmenterBuilder(object):
+    def __init__(self):
+        pass
+
+    def build(self, args, root=True):
+        if args is None or len(args) == 0:
+            return None
+        elif isinstance(args, list):
+            if root:
+                sequence = [self.build(value, root=False) for value in args]
+                return iaa.Sequential(sequence)
+            else:
+                return getattr(iaa, args[0])(
+                    *[self.to_tuple_if_list(a) for a in args[1:]])
+        elif isinstance(args, dict):
+            cls = getattr(iaa, args['type'])
+            return cls(**{
+                k: self.to_tuple_if_list(v)
+                for k, v in args['args'].items()
+            })
+        else:
+            raise RuntimeError('unknown augmenter arg: ' + str(args))
+
+    def to_tuple_if_list(self, obj):
+        if isinstance(obj, list):
+            return tuple(obj)
+        return obj
+
+
+class IaaAugment():
+    def __init__(self, augmenter_args=None, **kwargs):
+        if augmenter_args is None:
+            augmenter_args = [{
+                'type': 'Fliplr',
+                'args': {
+                    'p': 0.5
+                }
+            }, {
+                'type': 'Affine',
+                'args': {
+                    'rotate': [-10, 10]
+                }
+            }, {
+                'type': 'Resize',
+                'args': {
+                    'size': [0.5, 3]
+                }
+            }]
+        self.augmenter = AugmenterBuilder().build(augmenter_args)
+
+    def __call__(self, data):
+        image = data['image']
+        shape = image.shape
+
+        if self.augmenter:
+            aug = self.augmenter.to_deterministic()
+            data['image'] = aug.augment_image(image)
+            data = self.may_augment_annotation(aug, data, shape)
+        return data
+
+    def may_augment_annotation(self, aug, data, shape):
+        if aug is None:
+            return data
+
+        line_polys = []
+        for poly in data['polys']:
+            new_poly = self.may_augment_poly(aug, shape, poly)
+            line_polys.append(new_poly)
+        data['polys'] = np.array(line_polys)
+        return data
+
+    def may_augment_poly(self, aug, img_shape, poly):
+        keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
+        keypoints = aug.augment_keypoints(
+            [imgaug.KeypointsOnImage(
+                keypoints, shape=img_shape)])[0].keypoints
+        poly = [(p.x, p.y) for p in keypoints]
+        return poly
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import copy
+import numpy as np
+import string
+from shapely.geometry import LineString, Point, Polygon
+import json
+import copy
+from scipy.spatial import distance as dist
+from ppocr.utils.logging import get_logger
+
+
+class ClsLabelEncode(object):
+    def __init__(self, label_list, **kwargs):
+        self.label_list = label_list
+
+    def __call__(self, data):
+        label = data['label']
+        if label not in self.label_list:
+            return None
+        label = self.label_list.index(label)
+        data['label'] = label
+        return data
+
+
+class DetLabelEncode(object):
+    def __init__(self, **kwargs):
+        pass
+
+    def __call__(self, data):
+        label = data['label']
+        label = json.loads(label)
+        nBox = len(label)
+        boxes, txts, txt_tags = [], [], []
+        for bno in range(0, nBox):
+            box = label[bno]['points']
+            txt = label[bno]['transcription']
+            boxes.append(box)
+            txts.append(txt)
+            if txt in ['*', '###']:
+                txt_tags.append(True)
+            else:
+                txt_tags.append(False)
+        if len(boxes) == 0:
+            return None
+        boxes = self.expand_points_num(boxes)
+        boxes = np.array(boxes, dtype=np.float32)
+        txt_tags = np.array(txt_tags, dtype=np.bool)
+
+        data['polys'] = boxes
+        data['texts'] = txts
+        data['ignore_tags'] = txt_tags
+        return data
+
+    def order_points_clockwise(self, pts):
+        rect = np.zeros((4, 2), dtype="float32")
+        s = pts.sum(axis=1)
+        rect[0] = pts[np.argmin(s)]
+        rect[2] = pts[np.argmax(s)]
+        tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
+        diff = np.diff(np.array(tmp), axis=1)
+        rect[1] = tmp[np.argmin(diff)]
+        rect[3] = tmp[np.argmax(diff)]
+        return rect
+
+    def expand_points_num(self, boxes):
+        max_points_num = 0
+        for box in boxes:
+            if len(box) > max_points_num:
+                max_points_num = len(box)
+        ex_boxes = []
+        for box in boxes:
+            ex_box = box + [box[-1]] * (max_points_num - len(box))
+            ex_boxes.append(ex_box)
+        return ex_boxes
+
+
+class BaseRecLabelEncode(object):
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path=None,
+                 use_space_char=False):
+
+        self.max_text_len = max_text_length
+        self.beg_str = "sos"
+        self.end_str = "eos"
+        self.lower = False
+
+        if character_dict_path is None:
+            logger = get_logger()
+            logger.warning(
+                "The character_dict_path is None, model can only recognize number and lower letters"
+            )
+            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
+            dict_character = list(self.character_str)
+            self.lower = True
+        else:
+            self.character_str = []
+            with open(character_dict_path, "rb") as fin:
+                lines = fin.readlines()
+                for line in lines:
+                    line = line.decode('utf-8').strip("\n").strip("\r\n")
+                    self.character_str.append(line)
+            if use_space_char:
+                self.character_str.append(" ")
+            dict_character = list(self.character_str)
+        dict_character = self.add_special_char(dict_character)
+        self.dict = {}
+        for i, char in enumerate(dict_character):
+            self.dict[char] = i
+        self.character = dict_character
+
+    def add_special_char(self, dict_character):
+        return dict_character
+
+    def encode(self, text):
+        """convert text-label into text-index.
+        input:
+            text: text labels of each image. [batch_size]
+
+        output:
+            text: concatenated text index for CTCLoss.
+                    [sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)]
+            length: length of each text. [batch_size]
+        """
+        if len(text) == 0 or len(text) > self.max_text_len:
+            return None
+        if self.lower:
+            text = text.lower()
+        text_list = []
+        for char in text:
+            if char not in self.dict:
+                # logger = get_logger()
+                # logger.warning('{} is not in dict'.format(char))
+                continue
+            text_list.append(self.dict[char])
+        if len(text_list) == 0:
+            return None
+        return text_list
+
+
+class CTCLabelEncode(BaseRecLabelEncode):
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path=None,
+                 use_space_char=False,
+                 **kwargs):
+        super(CTCLabelEncode, self).__init__(
+            max_text_length, character_dict_path, use_space_char)
+
+    def __call__(self, data):
+        text = data['label']
+        text = self.encode(text)
+        if text is None:
+            return None
+        data['length'] = np.array(len(text))
+        text = text + [0] * (self.max_text_len - len(text))
+        data['label'] = np.array(text)
+
+        label = [0] * len(self.character)
+        for x in text:
+            label[x] += 1
+        data['label_ace'] = np.array(label)
+        return data
+
+    def add_special_char(self, dict_character):
+        dict_character = ['blank'] + dict_character
+        return dict_character
+
+
+class SARLabelEncode(BaseRecLabelEncode):
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path=None,
+                 use_space_char=False,
+                 **kwargs):
+        super(SARLabelEncode, self).__init__(
+            max_text_length, character_dict_path, use_space_char)
+
+    def add_special_char(self, dict_character):
+        beg_end_str = "<BOS/EOS>"
+        unknown_str = "<UKN>"
+        padding_str = "<PAD>"
+        dict_character = dict_character + [unknown_str]
+        self.unknown_idx = len(dict_character) - 1
+        dict_character = dict_character + [beg_end_str]
+        self.start_idx = len(dict_character) - 1
+        self.end_idx = len(dict_character) - 1
+        dict_character = dict_character + [padding_str]
+        self.padding_idx = len(dict_character) - 1
+
+        return dict_character
+
+    def __call__(self, data):
+        text = data['label']
+        text = self.encode(text)
+        if text is None:
+            return None
+        if len(text) >= self.max_text_len - 1:
+            return None
+        data['length'] = np.array(len(text))
+        target = [self.start_idx] + text + [self.end_idx]
+        padded_text = [self.padding_idx for _ in range(self.max_text_len)]
+
+        padded_text[:len(target)] = target
+        data['label'] = np.array(padded_text)
+        return data
+
+    def get_ignored_tokens(self):
+        return [self.padding_idx]
+    
+    
+class MultiLabelEncode(BaseRecLabelEncode):
+    def __init__(self,
+                 max_text_length,
+                 character_dict_path=None,
+                 use_space_char=False,
+                 **kwargs):
+        super(MultiLabelEncode, self).__init__(
+            max_text_length, character_dict_path, use_space_char)
+
+        self.ctc_encode = CTCLabelEncode(max_text_length, character_dict_path,
+                                         use_space_char, **kwargs)
+        self.sar_encode = SARLabelEncode(max_text_length, character_dict_path,
+                                         use_space_char, **kwargs)
+
+    def __call__(self, data):
+
+        data_ctc = copy.deepcopy(data)
+        data_sar = copy.deepcopy(data)
+        data_out = dict()
+        data_out['img_path'] = data.get('img_path', None)
+        data_out['image'] = data['image']
+        ctc = self.ctc_encode.__call__(data_ctc)
+        sar = self.sar_encode.__call__(data_sar)
+        if ctc is None or sar is None:
+            return None
+        data_out['label_ctc'] = ctc['label']
+        data_out['label_sar'] = sar['label']
+        data_out['length'] = ctc['length']
+        return data_out
+    
\ No newline at end of file
--- a/ppocr/data/imaug/make_border_map.py
+++ b/ppocr/data/imaug/make_border_map.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_border_map.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import cv2
+
+np.seterr(divide='ignore', invalid='ignore')
+import pyclipper
+from shapely.geometry import Polygon
+import sys
+import warnings
+
+warnings.simplefilter("ignore")
+
+__all__ = ['MakeBorderMap']
+
+
+class MakeBorderMap(object):
+    def __init__(self,
+                 shrink_ratio=0.4,
+                 thresh_min=0.3,
+                 thresh_max=0.7,
+                 **kwargs):
+        self.shrink_ratio = shrink_ratio
+        self.thresh_min = thresh_min
+        self.thresh_max = thresh_max
+
+    def __call__(self, data):
+
+        img = data['image']
+        text_polys = data['polys']
+        ignore_tags = data['ignore_tags']
+
+        canvas = np.zeros(img.shape[:2], dtype=np.float32)
+        mask = np.zeros(img.shape[:2], dtype=np.float32)
+
+        for i in range(len(text_polys)):
+            if ignore_tags[i]:
+                continue
+            self.draw_border_map(text_polys[i], canvas, mask=mask)
+        canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
+
+        data['threshold_map'] = canvas
+        data['threshold_mask'] = mask
+        return data
+
+    def draw_border_map(self, polygon, canvas, mask):
+        polygon = np.array(polygon)
+        assert polygon.ndim == 2
+        assert polygon.shape[1] == 2
+
+        polygon_shape = Polygon(polygon)
+        if polygon_shape.area <= 0:
+            return
+        distance = polygon_shape.area * (
+            1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length
+        subject = [tuple(l) for l in polygon]
+        padding = pyclipper.PyclipperOffset()
+        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+
+        padded_polygon = np.array(padding.Execute(distance)[0])
+        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
+
+        xmin = padded_polygon[:, 0].min()
+        xmax = padded_polygon[:, 0].max()
+        ymin = padded_polygon[:, 1].min()
+        ymax = padded_polygon[:, 1].max()
+        width = xmax - xmin + 1
+        height = ymax - ymin + 1
+
+        polygon[:, 0] = polygon[:, 0] - xmin
+        polygon[:, 1] = polygon[:, 1] - ymin
+
+        xs = np.broadcast_to(
+            np.linspace(
+                0, width - 1, num=width).reshape(1, width), (height, width))
+        ys = np.broadcast_to(
+            np.linspace(
+                0, height - 1, num=height).reshape(height, 1), (height, width))
+
+        distance_map = np.zeros(
+            (polygon.shape[0], height, width), dtype=np.float32)
+        for i in range(polygon.shape[0]):
+            j = (i + 1) % polygon.shape[0]
+            absolute_distance = self._distance(xs, ys, polygon[i], polygon[j])
+            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
+        distance_map = distance_map.min(axis=0)
+
+        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
+        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
+        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
+        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
+        canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax(
+            1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
+                             xmin_valid - xmin:xmax_valid - xmax + width],
+            canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1])
+
+    def _distance(self, xs, ys, point_1, point_2):
+        '''
+        compute the distance from point to a line
+        ys: coordinates in the first axis
+        xs: coordinates in the second axis
+        point_1, point_2: (x, y), the end of the line
+        '''
+        height, width = xs.shape[:2]
+        square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[
+            1])
+        square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[
+            1])
+        square_distance = np.square(point_1[0] - point_2[0]) + np.square(
+            point_1[1] - point_2[1])
+
+        cosin = (square_distance - square_distance_1 - square_distance_2) / (
+            2 * np.sqrt(square_distance_1 * square_distance_2))
+        square_sin = 1 - np.square(cosin)
+        square_sin = np.nan_to_num(square_sin)
+        result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
+                         square_distance)
+
+        result[cosin <
+               0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin
+                                                                           < 0]
+        # self.extend_line(point_1, point_2, result)
+        return result
+
+    def extend_line(self, point_1, point_2, result, shrink_ratio):
+        ex_point_1 = (int(
+            round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))),
+                      int(
+                          round(point_1[1] + (point_1[1] - point_2[1]) * (
+                              1 + shrink_ratio))))
+        cv2.line(
+            result,
+            tuple(ex_point_1),
+            tuple(point_1),
+            4096.0,
+            1,
+            lineType=cv2.LINE_AA,
+            shift=0)
+        ex_point_2 = (int(
+            round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))),
+                      int(
+                          round(point_2[1] + (point_2[1] - point_1[1]) * (
+                              1 + shrink_ratio))))
+        cv2.line(
+            result,
+            tuple(ex_point_2),
+            tuple(point_2),
+            4096.0,
+            1,
+            lineType=cv2.LINE_AA,
+            shift=0)
+        return ex_point_1, ex_point_2
--- a/ppocr/data/imaug/make_pse_gt.py
+++ b/ppocr/data/imaug/make_pse_gt.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import cv2
+import numpy as np
+import pyclipper
+from shapely.geometry import Polygon
+
+__all__ = ['MakePseGt']
+
+
+class MakePseGt(object):
+    def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs):
+        self.kernel_num = kernel_num
+        self.min_shrink_ratio = min_shrink_ratio
+        self.size = size
+
+    def __call__(self, data):
+
+        image = data['image']
+        text_polys = data['polys']
+        ignore_tags = data['ignore_tags']
+
+        h, w, _ = image.shape
+        short_edge = min(h, w)
+        if short_edge < self.size:
+            # keep short_size >= self.size
+            scale = self.size / short_edge
+            image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
+            text_polys *= scale
+
+        gt_kernels = []
+        for i in range(1, self.kernel_num + 1):
+            # s1->sn, from big to small
+            rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1
+                                                          ) * i
+            text_kernel, ignore_tags = self.generate_kernel(
+                image.shape[0:2], rate, text_polys, ignore_tags)
+            gt_kernels.append(text_kernel)
+
+        training_mask = np.ones(image.shape[0:2], dtype='uint8')
+        for i in range(text_polys.shape[0]):
+            if ignore_tags[i]:
+                cv2.fillPoly(training_mask,
+                             text_polys[i].astype(np.int32)[np.newaxis, :, :],
+                             0)
+
+        gt_kernels = np.array(gt_kernels)
+        gt_kernels[gt_kernels > 0] = 1
+
+        data['image'] = image
+        data['polys'] = text_polys
+        data['gt_kernels'] = gt_kernels[0:]
+        data['gt_text'] = gt_kernels[0]
+        data['mask'] = training_mask.astype('float32')
+        return data
+
+    def generate_kernel(self,
+                        img_size,
+                        shrink_ratio,
+                        text_polys,
+                        ignore_tags=None):
+        """
+        Refer to part of the code:
+        https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/base_textdet_targets.py
+        """
+
+        h, w = img_size
+        text_kernel = np.zeros((h, w), dtype=np.float32)
+        for i, poly in enumerate(text_polys):
+            polygon = Polygon(poly)
+            distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / (
+                polygon.length + 1e-6)
+            subject = [tuple(l) for l in poly]
+            pco = pyclipper.PyclipperOffset()
+            pco.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+            shrinked = np.array(pco.Execute(-distance))
+
+            if len(shrinked) == 0 or shrinked.size == 0:
+                if ignore_tags is not None:
+                    ignore_tags[i] = True
+                continue
+            try:
+                shrinked = np.array(shrinked[0]).reshape(-1, 2)
+            except:
+                if ignore_tags is not None:
+                    ignore_tags[i] = True
+                continue
+            cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1)
+        return text_kernel, ignore_tags
--- a/ppocr/data/imaug/make_shrink_map.py
+++ b/ppocr/data/imaug/make_shrink_map.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_shrink_map.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import cv2
+from shapely.geometry import Polygon
+import pyclipper
+
+__all__ = ['MakeShrinkMap']
+
+
+class MakeShrinkMap(object):
+    r'''
+    Making binary mask from detection data with ICDAR format.
+    Typically following the process of class `MakeICDARData`.
+    '''
+
+    def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs):
+        self.min_text_size = min_text_size
+        self.shrink_ratio = shrink_ratio
+
+    def __call__(self, data):
+        image = data['image']
+        text_polys = data['polys']
+        ignore_tags = data['ignore_tags']
+
+        h, w = image.shape[:2]
+        text_polys, ignore_tags = self.validate_polygons(text_polys,
+                                                         ignore_tags, h, w)
+        gt = np.zeros((h, w), dtype=np.float32)
+        mask = np.ones((h, w), dtype=np.float32)
+        for i in range(len(text_polys)):
+            polygon = text_polys[i]
+            height = max(polygon[:, 1]) - min(polygon[:, 1])
+            width = max(polygon[:, 0]) - min(polygon[:, 0])
+            if ignore_tags[i] or min(height, width) < self.min_text_size:
+                cv2.fillPoly(mask,
+                             polygon.astype(np.int32)[np.newaxis, :, :], 0)
+                ignore_tags[i] = True
+            else:
+                polygon_shape = Polygon(polygon)
+                subject = [tuple(l) for l in polygon]
+                padding = pyclipper.PyclipperOffset()
+                padding.AddPath(subject, pyclipper.JT_ROUND,
+                                pyclipper.ET_CLOSEDPOLYGON)
+                shrinked = []
+
+                # Increase the shrink ratio every time we get multiple polygon returned back
+                possible_ratios = np.arange(self.shrink_ratio, 1,
+                                            self.shrink_ratio)
+                np.append(possible_ratios, 1)
+                # print(possible_ratios)
+                for ratio in possible_ratios:
+                    # print(f"Change shrink ratio to {ratio}")
+                    distance = polygon_shape.area * (
+                        1 - np.power(ratio, 2)) / polygon_shape.length
+                    shrinked = padding.Execute(-distance)
+                    if len(shrinked) == 1:
+                        break
+
+                if shrinked == []:
+                    cv2.fillPoly(mask,
+                                 polygon.astype(np.int32)[np.newaxis, :, :], 0)
+                    ignore_tags[i] = True
+                    continue
+
+                for each_shirnk in shrinked:
+                    shirnk = np.array(each_shirnk).reshape(-1, 2)
+                    cv2.fillPoly(gt, [shirnk.astype(np.int32)], 1)
+
+        data['shrink_map'] = gt
+        data['shrink_mask'] = mask
+        return data
+
+    def validate_polygons(self, polygons, ignore_tags, h, w):
+        '''
+        polygons (numpy.array, required): of shape (num_instances, num_points, 2)
+        '''
+        if len(polygons) == 0:
+            return polygons, ignore_tags
+        assert len(polygons) == len(ignore_tags)
+        for polygon in polygons:
+            polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
+            polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
+
+        for i in range(len(polygons)):
+            area = self.polygon_area(polygons[i])
+            if abs(area) < 1:
+                ignore_tags[i] = True
+            if area > 0:
+                polygons[i] = polygons[i][::-1, :]
+        return polygons, ignore_tags
+
+    def polygon_area(self, polygon):
+        """
+        compute polygon area
+        """
+        area = 0
+        q = polygon[-1]
+        for p in polygon:
+            area += p[0] * q[1] - p[1] * q[0]
+            q = p
+        return area / 2.0
--- a/ppocr/data/imaug/operators.py
+++ b/ppocr/data/imaug/operators.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+import six
+import cv2
+import numpy as np
+import math
+
+
+class DecodeImage(object):
+    """ decode image """
+
+    def __init__(self,
+                 img_mode='RGB',
+                 channel_first=False,
+                 ignore_orientation=False,
+                 **kwargs):
+        self.img_mode = img_mode
+        self.channel_first = channel_first
+        self.ignore_orientation = ignore_orientation
+
+    def __call__(self, data):
+        img = data['image']
+        if six.PY2:
+            assert type(img) is str and len(
+                img) > 0, "invalid input 'img' in DecodeImage"
+        else:
+            assert type(img) is bytes and len(
+                img) > 0, "invalid input 'img' in DecodeImage"
+        img = np.frombuffer(img, dtype='uint8')
+        if self.ignore_orientation:
+            img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
+                               cv2.IMREAD_COLOR)
+        else:
+            img = cv2.imdecode(img, 1)
+        if img is None:
+            return None
+        if self.img_mode == 'GRAY':
+            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        elif self.img_mode == 'RGB':
+            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape)
+            img = img[:, :, ::-1]
+
+        if self.channel_first:
+            img = img.transpose((2, 0, 1))
+
+        data['image'] = img
+        return data
+        
+
+class NormalizeImage(object):
+    """ normalize image such as substract mean, divide std
+    """
+
+    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
+        if isinstance(scale, str):
+            scale = eval(scale)
+        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
+        mean = mean if mean is not None else [0.485, 0.456, 0.406]
+        std = std if std is not None else [0.229, 0.224, 0.225]
+
+        shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
+        self.mean = np.array(mean).reshape(shape).astype('float32')
+        self.std = np.array(std).reshape(shape).astype('float32')
+
+    def __call__(self, data):
+        img = data['image']
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+        assert isinstance(img,
+                          np.ndarray), "invalid input 'img' in NormalizeImage"
+        data['image'] = (
+            img.astype('float32') * self.scale - self.mean) / self.std
+        return data
+
+
+class ToCHWImage(object):
+    """ convert hwc image to chw image
+    """
+
+    def __init__(self, **kwargs):
+        pass
+
+    def __call__(self, data):
+        img = data['image']
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+        data['image'] = img.transpose((2, 0, 1))
+        return data
+
+
+class KeepKeys(object):
+    def __init__(self, keep_keys, **kwargs):
+        self.keep_keys = keep_keys
+
+    def __call__(self, data):
+        data_list = []
+        for key in self.keep_keys:
+            data_list.append(data[key])
+        return data_list
+    
+    
+class DetResizeForTest(object):
+    def __init__(self, **kwargs):
+        super(DetResizeForTest, self).__init__()
+        self.resize_type = 0
+        if 'image_shape' in kwargs:
+            self.image_shape = kwargs['image_shape']
+            self.resize_type = 1
+        elif 'limit_side_len' in kwargs:
+            self.limit_side_len = kwargs['limit_side_len']
+            self.limit_type = kwargs.get('limit_type', 'min')
+        elif 'resize_long' in kwargs:
+            self.resize_type = 2
+            self.resize_long = kwargs.get('resize_long', 960)
+        else:
+            self.limit_side_len = 736
+            self.limit_type = 'min'
+
+    def __call__(self, data):
+        img = data['image']
+        src_h, src_w, _ = img.shape
+
+        if self.resize_type == 0:
+            # img, shape = self.resize_image_type0(img)
+            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
+        elif self.resize_type == 2:
+            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
+        else:
+            # img, shape = self.resize_image_type1(img)
+            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
+        data['image'] = img
+        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
+        return data
+
+    def resize_image_type1(self, img):
+        resize_h, resize_w = self.image_shape
+        ori_h, ori_w = img.shape[:2]  # (h, w, c)
+        ratio_h = float(resize_h) / ori_h
+        ratio_w = float(resize_w) / ori_w
+        img = cv2.resize(img, (int(resize_w), int(resize_h)))
+        # return img, np.array([ori_h, ori_w])
+        return img, [ratio_h, ratio_w]
+
+    def resize_image_type0(self, img):
+        """
+        resize image to a size multiple of 32 which is required by the network
+        args:
+            img(array): array with shape [h, w, c]
+        return(tuple):
+            img, (ratio_h, ratio_w)
+        """
+        limit_side_len = self.limit_side_len
+        h, w, c = img.shape
+
+        # limit the max side
+        if self.limit_type == 'max':
+            if max(h, w) > limit_side_len:
+                if h > w:
+                    ratio = float(limit_side_len) / h
+                else:
+                    ratio = float(limit_side_len) / w
+            else:
+                ratio = 1.
+        elif self.limit_type == 'min':
+            if min(h, w) < limit_side_len:
+                if h < w:
+                    ratio = float(limit_side_len) / h
+                else:
+                    ratio = float(limit_side_len) / w
+            else:
+                ratio = 1.
+        elif self.limit_type == 'resize_long':
+            ratio = float(limit_side_len) / max(h, w)
+        else:
+            raise Exception('not support limit type, image ')
+        resize_h = int(h * ratio)
+        resize_w = int(w * ratio)
+
+        resize_h = max(int(round(resize_h / 32) * 32), 32)
+        resize_w = max(int(round(resize_w / 32) * 32), 32)
+
+        try:
+            if int(resize_w) <= 0 or int(resize_h) <= 0:
+                return None, (None, None)
+            img = cv2.resize(img, (int(resize_w), int(resize_h)))
+        except:
+            print(img.shape, resize_w, resize_h)
+            sys.exit(0)
+        ratio_h = resize_h / float(h)
+        ratio_w = resize_w / float(w)
+        return img, [ratio_h, ratio_w]
+
+    def resize_image_type2(self, img):
+        h, w, _ = img.shape
+
+        resize_w = w
+        resize_h = h
+
+        if resize_h > resize_w:
+            ratio = float(self.resize_long) / resize_h
+        else:
+            ratio = float(self.resize_long) / resize_w
+
+        resize_h = int(resize_h * ratio)
+        resize_w = int(resize_w * ratio)
+
+        max_stride = 128
+        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
+        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
+        img = cv2.resize(img, (int(resize_w), int(resize_h)))
+        ratio_h = resize_h / float(h)
+        ratio_w = resize_w / float(w)
+
+        return img, [ratio_h, ratio_w]
+
+
+class DetResizeForSingle(object):
+    def __init__(self, **kwargs):
+        super(DetResizeForSingle, self).__init__()
+
+    def __call__(self, data):
+        # print("DetResizeForSingle")
+        img = data['image']
+        src_h, src_w, _ = img.shape
+        img, [ratio_h, ratio_w], [resize_h, resize_w] = self.resize_image(img)
+        data['image'] = img
+        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w, resize_h, resize_w])
+        return data
+
+    def resize_image(self, img):
+        """
+        resize image to a size multiple of 32 which is required by the network
+        args:
+            img(array): array with shape [h, w, c]
+        return(tuple):
+            img, (ratio_h, ratio_w)
+        """
+        h, w, c = img.shape
+        if h > w:
+            resize_h = 1280
+            ratio = float(1280) / h
+            resize_w = int(w * ratio)
+        else:
+            resize_w = 1280
+            ratio = float(1280) / w
+            resize_h = int(h * ratio)
+
+        try:
+            if int(resize_w) <= 0 or int(resize_h) <= 0:
+                return None, (None, None)
+            img = cv2.resize(img, (int(resize_w), int(resize_h)))
+        except:
+            print(img.shape, resize_w, resize_h)
+            sys.exit(0)
+            
+        img_pd_h = 1280
+        img_pd_w = 1280
+        
+        padding_im = np.zeros((img_pd_h, img_pd_w, 3), dtype=np.uint8)
+        top = int((img_pd_h - resize_h) / 2)
+        left = int((img_pd_w -resize_w) / 2)
+        padding_im[top:top + int(resize_h), left:left + int(resize_w), :] = img
+            
+        ratio_h = img_pd_h / float(h)
+        ratio_w = img_pd_w / float(w)
+        return padding_im, [ratio_h, ratio_w], [resize_h, resize_w]
--- a/ppocr/data/imaug/randaugment.py
+++ b/ppocr/data/imaug/randaugment.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from PIL import Image, ImageEnhance, ImageOps
+import numpy as np
+import random
+import six
+
+
+class RawRandAugment(object):
+    def __init__(self,
+                 num_layers=2,
+                 magnitude=5,
+                 fillcolor=(128, 128, 128),
+                 **kwargs):
+        self.num_layers = num_layers
+        self.magnitude = magnitude
+        self.max_level = 10
+
+        abso_level = self.magnitude / self.max_level
+        self.level_map = {
+            "shearX": 0.3 * abso_level,
+            "shearY": 0.3 * abso_level,
+            "translateX": 150.0 / 331 * abso_level,
+            "translateY": 150.0 / 331 * abso_level,
+            "rotate": 30 * abso_level,
+            "color": 0.9 * abso_level,
+            "posterize": int(4.0 * abso_level),
+            "solarize": 256.0 * abso_level,
+            "contrast": 0.9 * abso_level,
+            "sharpness": 0.9 * abso_level,
+            "brightness": 0.9 * abso_level,
+            "autocontrast": 0,
+            "equalize": 0,
+            "invert": 0
+        }
+
+        # from https://stackoverflow.com/questions/5252170/
+        # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
+        def rotate_with_fill(img, magnitude):
+            rot = img.convert("RGBA").rotate(magnitude)
+            return Image.composite(rot,
+                                   Image.new("RGBA", rot.size, (128, ) * 4),
+                                   rot).convert(img.mode)
+
+        rnd_ch_op = random.choice
+
+        self.func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0),
+                Image.BICUBIC,
+                fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0),
+                Image.BICUBIC,
+                fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0),
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])),
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "posterize": lambda img, magnitude:
+            ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude:
+            ImageOps.solarize(img, magnitude),
+            "contrast": lambda img, magnitude:
+            ImageEnhance.Contrast(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "sharpness": lambda img, magnitude:
+            ImageEnhance.Sharpness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "brightness": lambda img, magnitude:
+            ImageEnhance.Brightness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "autocontrast": lambda img, magnitude:
+            ImageOps.autocontrast(img),
+            "equalize": lambda img, magnitude: ImageOps.equalize(img),
+            "invert": lambda img, magnitude: ImageOps.invert(img)
+        }
+
+    def __call__(self, img):
+        avaiable_op_names = list(self.level_map.keys())
+        for layer_num in range(self.num_layers):
+            op_name = np.random.choice(avaiable_op_names)
+            img = self.func[op_name](img, self.level_map[op_name])
+        return img
+
+
+class RandAugment(RawRandAugment):
+    """ RandAugment wrapper to auto fit different img types """
+
+    def __init__(self, prob=0.5, *args, **kwargs):
+        self.prob = prob
+        if six.PY2:
+            super(RandAugment, self).__init__(*args, **kwargs)
+        else:
+            super().__init__(*args, **kwargs)
+
+    def __call__(self, data):
+        if np.random.rand() > self.prob:
+            return data
+        img = data['image']
+        if not isinstance(img, Image.Image):
+            img = np.ascontiguousarray(img)
+            img = Image.fromarray(img)
+
+        if six.PY2:
+            img = super(RandAugment, self).__call__(img)
+        else:
+            img = super().__call__(img)
+
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+        data['image'] = img
+        return data
--- a/ppocr/data/imaug/random_crop_data.py
+++ b/ppocr/data/imaug/random_crop_data.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is refer from:
+https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/random_crop_data.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+import cv2
+import random
+
+
+def is_poly_in_rect(poly, x, y, w, h):
+    poly = np.array(poly)
+    if poly[:, 0].min() < x or poly[:, 0].max() > x + w:
+        return False
+    if poly[:, 1].min() < y or poly[:, 1].max() > y + h:
+        return False
+    return True
+
+
+def is_poly_outside_rect(poly, x, y, w, h):
+    poly = np.array(poly)
+    if poly[:, 0].max() < x or poly[:, 0].min() > x + w:
+        return True
+    if poly[:, 1].max() < y or poly[:, 1].min() > y + h:
+        return True
+    return False
+
+
+def split_regions(axis):
+    regions = []
+    min_axis = 0
+    for i in range(1, axis.shape[0]):
+        if axis[i] != axis[i - 1] + 1:
+            region = axis[min_axis:i]
+            min_axis = i
+            regions.append(region)
+    return regions
+
+
+def random_select(axis, max_size):
+    xx = np.random.choice(axis, size=2)
+    xmin = np.min(xx)
+    xmax = np.max(xx)
+    xmin = np.clip(xmin, 0, max_size - 1)
+    xmax = np.clip(xmax, 0, max_size - 1)
+    return xmin, xmax
+
+
+def region_wise_random_select(regions, max_size):
+    selected_index = list(np.random.choice(len(regions), 2))
+    selected_values = []
+    for index in selected_index:
+        axis = regions[index]
+        xx = int(np.random.choice(axis, size=1))
+        selected_values.append(xx)
+    xmin = min(selected_values)
+    xmax = max(selected_values)
+    return xmin, xmax
+
+
+def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
+    h, w, _ = im.shape
+    h_array = np.zeros(h, dtype=np.int32)
+    w_array = np.zeros(w, dtype=np.int32)
+    for points in text_polys:
+        points = np.round(points, decimals=0).astype(np.int32)
+        minx = np.min(points[:, 0])
+        maxx = np.max(points[:, 0])
+        w_array[minx:maxx] = 1
+        miny = np.min(points[:, 1])
+        maxy = np.max(points[:, 1])
+        h_array[miny:maxy] = 1
+    # ensure the cropped area not across a text
+    h_axis = np.where(h_array == 0)[0]
+    w_axis = np.where(w_array == 0)[0]
+
+    if len(h_axis) == 0 or len(w_axis) == 0:
+        return 0, 0, w, h
+
+    h_regions = split_regions(h_axis)
+    w_regions = split_regions(w_axis)
+
+    for i in range(max_tries):
+        if len(w_regions) > 1:
+            xmin, xmax = region_wise_random_select(w_regions, w)
+        else:
+            xmin, xmax = random_select(w_axis, w)
+        if len(h_regions) > 1:
+            ymin, ymax = region_wise_random_select(h_regions, h)
+        else:
+            ymin, ymax = random_select(h_axis, h)
+
+        if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h:
+            # area too small
+            continue
+        num_poly_in_rect = 0
+        for poly in text_polys:
+            if not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin,
+                                        ymax - ymin):
+                num_poly_in_rect += 1
+                break
+
+        if num_poly_in_rect > 0:
+            return xmin, ymin, xmax - xmin, ymax - ymin
+
+    return 0, 0, w, h
+
+
+class EastRandomCropData(object):
+    def __init__(self,
+                 size=(640, 640),
+                 max_tries=10,
+                 min_crop_side_ratio=0.1,
+                 keep_ratio=True,
+                 **kwargs):
+        self.size = size
+        self.max_tries = max_tries
+        self.min_crop_side_ratio = min_crop_side_ratio
+        self.keep_ratio = keep_ratio
+
+    def __call__(self, data):
+        img = data['image']
+        text_polys = data['polys']
+        ignore_tags = data['ignore_tags']
+        texts = data['texts']
+        all_care_polys = [
+            text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
+        ]
+        # 计算crop区域
+        crop_x, crop_y, crop_w, crop_h = crop_area(
+            img, all_care_polys, self.min_crop_side_ratio, self.max_tries)
+        # crop 图片 保持比例填充
+        scale_w = self.size[0] / crop_w
+        scale_h = self.size[1] / crop_h
+        scale = min(scale_w, scale_h)
+        h = int(crop_h * scale)
+        w = int(crop_w * scale)
+        if self.keep_ratio:
+            padimg = np.zeros((self.size[1], self.size[0], img.shape[2]),
+                              img.dtype)
+            padimg[:h, :w] = cv2.resize(
+                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h))
+            img = padimg
+        else:
+            img = cv2.resize(
+                img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w],
+                tuple(self.size))
+        # crop 文本框
+        text_polys_crop = []
+        ignore_tags_crop = []
+        texts_crop = []
+        for poly, text, tag in zip(text_polys, texts, ignore_tags):
+            poly = ((poly - (crop_x, crop_y)) * scale).tolist()
+            if not is_poly_outside_rect(poly, 0, 0, w, h):
+                text_polys_crop.append(poly)
+                ignore_tags_crop.append(tag)
+                texts_crop.append(text)
+        data['image'] = img
+        data['polys'] = np.array(text_polys_crop)
+        data['ignore_tags'] = ignore_tags_crop
+        data['texts'] = texts_crop
+        return data
+
+
+class RandomCropImgMask(object):
+    def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs):
+        self.size = size
+        self.main_key = main_key
+        self.crop_keys = crop_keys
+        self.p = p
+
+    def __call__(self, data):
+        image = data['image']
+
+        h, w = image.shape[0:2]
+        th, tw = self.size
+        if w == tw and h == th:
+            return data
+
+        mask = data[self.main_key]
+        if np.max(mask) > 0 and random.random() > self.p:
+            # make sure to crop the text region
+            tl = np.min(np.where(mask > 0), axis=1) - (th, tw)
+            tl[tl < 0] = 0
+            br = np.max(np.where(mask > 0), axis=1) - (th, tw)
+            br[br < 0] = 0
+
+            br[0] = min(br[0], h - th)
+            br[1] = min(br[1], w - tw)
+
+            i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0
+            j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0
+        else:
+            i = random.randint(0, h - th) if h - th > 0 else 0
+            j = random.randint(0, w - tw) if w - tw > 0 else 0
+
+        # return i, j, th, tw
+        for k in data:
+            if k in self.crop_keys:
+                if len(data[k].shape) == 3:
+                    if np.argmin(data[k].shape) == 0:
+                        img = data[k][:, i:i + th, j:j + tw]
+                        if img.shape[1] != img.shape[2]:
+                            a = 1
+                    elif np.argmin(data[k].shape) == 2:
+                        img = data[k][i:i + th, j:j + tw, :]
+                        if img.shape[1] != img.shape[0]:
+                            a = 1
+                    else:
+                        img = data[k]
+                else:
+                    img = data[k][i:i + th, j:j + tw]
+                    if img.shape[0] != img.shape[1]:
+                        a = 1
+                data[k] = img
+        return data