fix conflict

6c7ff9c7 · LDOUBLEV · ac91a9e1 · 9b8f587e · 6c7ff9c7 · 6c7ff9c7
Commit 6c7ff9c7 authored Aug 05, 2021 by LDOUBLEV
20 changed files
--- a/doc/table/pipeline.jpg
+++ b/doc/table/pipeline.jpg
--- a/doc/table/pipeline_en.jpg
+++ b/doc/table/pipeline_en.jpg
--- a/doc/table/ppstructure.GIF
+++ b/doc/table/ppstructure.GIF
--- a/doc/table/result_all.jpg
+++ b/doc/table/result_all.jpg
--- a/doc/table/result_text.jpg
+++ b/doc/table/result_text.jpg
--- a/doc/table/table.jpg
+++ b/doc/table/table.jpg
--- a/doc/table/tableocr_pipeline.jpg
+++ b/doc/table/tableocr_pipeline.jpg
--- a/doc/table/tableocr_pipeline_en.jpg
+++ b/doc/table/tableocr_pipeline_en.jpg
--- a/paddleocr.py
+++ b/paddleocr.py
--- a/ppocr/data/__init__.py
+++ b/ppocr/data/__init__.py
@@ -35,6 +35,7 @@ from ppocr.data.imaug import transform, create_operators
 from ppocr.data.simple_dataset import SimpleDataSet
 from ppocr.data.lmdb_dataset import LMDBDataSet
 from ppocr.data.pgnet_dataset import PGDataSet
+from ppocr.data.pubtab_dataset import PubTabDataSet

 __all__ = ['build_dataloader', 'transform', 'create_operators']

@@ -55,7 +56,7 @@ signal.signal(signal.SIGTERM, term_mp)
 def build_dataloader(config, mode, device, logger, seed=None):
    config = copy.deepcopy(config)

-    support_dict = ['SimpleDataSet', 'LMDBDataSet', 'PGDataSet']
+    support_dict = ['SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet']
    module_name = config[mode]['dataset']['name']
    assert module_name in support_dict, Exception(
        'DataSet only support {}'.format(support_dict))

--- a/ppocr/data/imaug/__init__.py
+++ b/ppocr/data/imaug/__init__.py
@@ -23,12 +23,14 @@ from .random_crop_data import EastRandomCropData, PSERandomCrop

 from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg, SRNRecResizeImg
 from .randaugment import RandAugment
+from .copy_paste import CopyPaste
 from .operators import *
 from .label_ops import *

 from .east_process import *
 from .sast_process import *
 from .pg_process import *
+from .gen_table_mask import *


 def transform(data, ops=None):

--- a/ppocr/data/imaug/copy_paste.py
+++ b/ppocr/data/imaug/copy_paste.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import cv2
+import random
+import numpy as np
+from PIL import Image
+from shapely.geometry import Polygon
+
+from ppocr.data.imaug.iaa_augment import IaaAugment
+from ppocr.data.imaug.random_crop_data import is_poly_outside_rect
+from tools.infer.utility import get_rotate_crop_image
+
+
+class CopyPaste(object):
+    """Copy-paste augmentation for text detection samples.
+
+    Text regions are cropped from one extra sample (``data['ext_data'][0]``),
+    rotated by a random angle and pasted onto the current image; every
+    pasted box is appended to ``data['polys']`` / ``data['ignore_tags']``.
+
+    Args:
+        objects_paste_ratio: fraction of the extra sample's polygons to
+            paste per call; the resulting count is kept within [1, 30].
+        limit_paste: if True, a paste position is accepted only when
+            ``is_poly_outside_rect`` reports every existing polygon as
+            outside the pasted box's bounding rectangle.
+    """
+
+    def __init__(self, objects_paste_ratio=0.2, limit_paste=True, **kwargs):
+        # Number of extra samples this op expects in data['ext_data'].
+        self.ext_data_num = 1
+        self.objects_paste_ratio = objects_paste_ratio
+        self.limit_paste = limit_paste
+        augmenter_args = [{'type': 'Resize', 'args': {'size': [0.5, 3]}}]
+        self.aug = IaaAugment(augmenter_args)
+
+    def __call__(self, data):
+        """Paste randomly chosen text crops from the extra sample into ``data``.
+
+        Returns:
+            The same ``data`` dict with 'image', 'polys' and 'ignore_tags'
+            replaced by their augmented versions.
+        """
+        canvas = data['image']
+        polys = data['polys'].tolist()
+        ignores = data['ignore_tags'].tolist()
+        donor = data['ext_data'][0]
+        donor_image = donor['image']
+        donor_polys = donor['polys']
+        donor_ignores = donor['ignore_tags']
+
+        # Only regions that are not flagged as ignored are candidates.
+        candidates = [
+            i for i, ignored in enumerate(donor_ignores) if not ignored
+        ]
+        wanted = int(self.objects_paste_ratio * len(donor_polys))
+        select_num = max(1, min(wanted, 30))
+
+        random.shuffle(candidates)
+        chosen = candidates[:select_num]
+        chosen_polys = donor_polys[chosen]
+        chosen_ignores = donor_ignores[chosen]
+
+        # PIL works in RGB(A); convert on the way in and back on the way out.
+        canvas = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
+        donor_image = cv2.cvtColor(donor_image, cv2.COLOR_BGR2RGB)
+        canvas = Image.fromarray(canvas).convert('RGBA')
+        for poly, tag in zip(chosen_polys, chosen_ignores):
+            crop = get_rotate_crop_image(donor_image, poly)
+            canvas, pasted_box = self.paste_img(canvas, crop, polys)
+            if pasted_box is None:
+                continue
+            polys.append(pasted_box)
+            ignores.append(tag)
+        canvas = cv2.cvtColor(np.array(canvas), cv2.COLOR_RGB2BGR)
+        height, width = canvas.shape[:2]
+        polys = np.array(polys)
+        # Keep every polygon coordinate inside the image bounds.
+        polys[:, :, 0] = np.clip(polys[:, :, 0], 0, width)
+        polys[:, :, 1] = np.clip(polys[:, :, 1], 0, height)
+        data['image'] = canvas
+        data['polys'] = polys
+        data['ignore_tags'] = np.array(ignores)
+        return data
+
+    def paste_img(self, src_img, box_img, src_polys):
+        """Rotate ``box_img`` by a random angle and paste it onto ``src_img``.
+
+        Returns:
+            ``(src_img, box)`` where ``box`` holds the four pasted corner
+            points, or ``(src_img, None)`` when the rotated crop does not
+            fit into the image or no paste position was found.
+        """
+        patch = Image.fromarray(box_img).convert('RGBA')
+        src_w, src_h = src_img.size
+        patch_w, patch_h = patch.size
+
+        angle = np.random.randint(0, 360)
+        corners = np.array(
+            [[[0, 0], [patch_w, 0], [patch_w, patch_h], [0, patch_h]]])
+        box = rotate_bbox(box_img, corners, angle)[0]
+        patch = patch.rotate(angle, expand=1)
+        patch_w, patch_h = patch.width, patch.height
+        max_x = src_w - patch_w
+        max_y = src_h - patch_h
+        if max_x < 0 or max_y < 0:
+            return src_img, None
+
+        paste_x, paste_y = self.select_coord(src_polys, box, max_x, max_y)
+        if paste_x is None:
+            return src_img, None
+        box[:, 0] += paste_x
+        box[:, 1] += paste_y
+        # The alpha band masks out the transparent corners added by rotate().
+        alpha = patch.split()[3]
+        src_img.paste(patch, (paste_x, paste_y), mask=alpha)
+
+        return src_img, box
+
+    def select_coord(self, src_polys, box, endx, endy):
+        """Pick a top-left paste position within ``[0, endx] x [0, endy]``.
+
+        With ``limit_paste`` enabled, up to 50 random positions are tried
+        and ``(None, None)`` is returned if none of them is accepted.
+        """
+        if not self.limit_paste:
+            paste_x = random.randint(0, endx)
+            paste_y = random.randint(0, endy)
+            return paste_x, paste_y
+
+        xmin, ymin = box[:, 0].min(), box[:, 1].min()
+        xmax, ymax = box[:, 0].max(), box[:, 1].max()
+        for _ in range(50):
+            paste_x = random.randint(0, endx)
+            paste_y = random.randint(0, endy)
+            xmin1 = xmin + paste_x
+            xmax1 = xmax + paste_x
+            ymin1 = ymin + paste_y
+            ymax1 = ymax + paste_y
+
+            blocked = any(
+                not is_poly_outside_rect(poly, xmin1, ymin1, xmax1 - xmin1,
+                                         ymax1 - ymin1)
+                for poly in src_polys)
+            if not blocked:
+                return paste_x, paste_y
+        return None, None
+
+
+def get_union(pD, pG):
+    """Return the area of the union of the polygons built from pD and pG."""
+    poly_d = Polygon(pD)
+    poly_g = Polygon(pG)
+    return poly_d.union(poly_g).area
+
+
+def get_intersection_over_union(pD, pG):
+    """Return intersection area divided by union area for polygons pD and pG.
+
+    No guard is applied for a zero union area.
+    """
+    intersection_area = get_intersection(pD, pG)
+    union_area = get_union(pD, pG)
+    return intersection_area / union_area
+
+
+def get_intersection(pD, pG):
+    """Return the area of the intersection of the polygons built from pD and pG."""
+    poly_d = Polygon(pD)
+    poly_g = Polygon(pG)
+    return poly_d.intersection(poly_g).area
+
+
+def rotate_bbox(img, text_polys, angle, scale=1):
+    """Rotate quadrilateral boxes together with an image that is rotated
+    about its centre into an enlarged canvas.
+
+    Adapted from
+    https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/augment.py
+
+    Args:
+        img: np.ndarray, only its height and width are used
+        text_polys: np.ndarray N*4*2
+        angle: rotation angle in degrees
+        scale: scale factor passed to cv2.getRotationMatrix2D
+
+    Returns:
+        np.ndarray of dtype float32 and shape N*4*2 with the rotated points
+    """
+    h, w = img.shape[:2]
+
+    rangle = np.deg2rad(angle)
+    # Size of the canvas that fully contains the rotated image.
+    nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w))
+    nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w))
+    rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, scale)
+    # Shift so the original image centre lands on the new canvas centre.
+    rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
+    rot_mat[:, 2] += rot_move
+
+    # ---------------------- rotate box ----------------------
+    rot_text_polys = [[
+        np.dot(rot_mat, np.array([bbox[k, 0], bbox[k, 1], 1]))
+        for k in range(4)
+    ] for bbox in text_polys]
+    return np.array(rot_text_polys, dtype=np.float32)
--- a/ppocr/data/imaug/gen_table_mask.py
+++ b/ppocr/data/imaug/gen_table_mask.py
+"""
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import sys
+import six
+import cv2
+import numpy as np
+
+
+class GenTableMask(object):
+    """Generate a text-region mask for a table image from its cell boxes.
+
+    Each cell box is split into text-line boxes with projection profiles,
+    every line box is shrunk slightly, and the resulting rectangles are
+    painted into a mask.
+
+    Args:
+        shrink_h_max: upper bound (pixels) of the vertical shrink per side.
+        shrink_w_max: upper bound (pixels) of the horizontal shrink per side.
+        mask_type: 1 writes a single-channel 0/1 mask to
+            ``data['mask_img']``; any other value replaces ``data['image']``
+            with a 3-channel 0/255 mask.
+    """
+
+    def __init__(self, shrink_h_max, shrink_w_max, mask_type=0, **kwargs):
+        # Use the configured limits instead of overwriting them with 5.
+        self.shrink_h_max = shrink_h_max
+        self.shrink_w_max = shrink_w_max
+        self.mask_type = mask_type
+
+    def projection(self, erosion, h, w, spilt_threshold=0):
+        """Split a binary image into runs of rows that contain foreground.
+
+        Args:
+            erosion: 2-D array whose foreground pixels equal 255.
+            h: number of rows of ``erosion`` to scan.
+            w: number of columns of ``erosion`` to scan.
+            spilt_threshold: a row counts as text when it holds more than
+                this many foreground pixels.
+
+        Returns:
+            ``(box_list, projection_map)``: ``box_list`` is a list of
+            ``(start_row, end_row)`` tuples, ``projection_map`` is an array
+            shaped like ``erosion`` with the row histogram drawn as zeros
+            on a background of ones.
+        """
+        # Horizontal projection: foreground pixel count of every row.
+        project_val_array = (erosion[:h, :w] == 255).sum(axis=1)
+
+        # Walk the profile and record where text runs start and end.
+        start_idx = 0  # row index at which the current text run began
+        in_text = False  # whether the scan is currently inside a text run
+        box_list = []
+        for i, count in enumerate(project_val_array):
+            if not in_text and count > spilt_threshold:
+                in_text = True
+                start_idx = i
+            elif in_text and count <= spilt_threshold:
+                in_text = False
+                # Runs of at most two rows are dropped.
+                if i - start_idx <= 2:
+                    continue
+                box_list.append((start_idx, i + 1))
+
+        if in_text:
+            box_list.append((start_idx, h - 1))
+
+        # Draw the projection histogram.
+        projection_map = np.ones_like(erosion)
+        for j, count in enumerate(project_val_array):
+            projection_map[j, :count] = 0
+        return box_list, projection_map
+
+    def projection_cx(self, box_img):
+        """Split one cell crop into per-text-line boxes.
+
+        Args:
+            box_img: BGR crop of a single table cell.
+
+        Returns:
+            List of ``[left, top, right, bottom]`` boxes in crop
+            coordinates. When at most one text line is detected, a single
+            box covering the whole crop is returned.
+        """
+        box_gray_img = cv2.cvtColor(box_img, cv2.COLOR_BGR2GRAY)
+        h, w = box_gray_img.shape
+        # Binarise: gray values <= 200 become foreground (255).
+        _, thresh1 = cv2.threshold(box_gray_img, 200, 255,
+                                   cv2.THRESH_BINARY_INV)
+        # Vertical erosion, applied only to crops that are wider than tall.
+        if h < w:
+            erode = cv2.erode(thresh1, np.ones((2, 1), np.uint8), iterations=1)
+        else:
+            erode = thresh1
+        # Horizontal dilation.
+        erosion = cv2.dilate(erode, np.ones((1, 5), np.uint8), iterations=1)
+
+        # Row runs of the crop; shares the logic used for columns below.
+        box_list, _ = self.projection(erosion, h, w)
+        if len(box_list) <= 1:
+            return [[0, 0, w, h]]
+
+        split_bbox_list = []
+        last = len(box_list) - 1
+        for i, (h_start, h_end) in enumerate(box_list):
+            # The first/last line boxes are stretched to the crop borders.
+            # (The last-box test used to compare against len(box_list) and
+            # therefore never fired.)
+            if i == 0:
+                h_start = 0
+            if i == last:
+                h_end = h
+            word_img = erosion[h_start:h_end + 1, :]
+            word_h, word_w = word_img.shape
+            # Column runs of this line, obtained by projecting the transpose.
+            w_split_list, _ = self.projection(word_img.T, word_w, word_h)
+            if w_split_list:
+                w_start, w_end = w_split_list[0][0], w_split_list[-1][1]
+            else:
+                # No column run survived the minimum-width filter; fall
+                # back to the full crop width instead of raising IndexError.
+                w_start, w_end = 0, w
+            # Pad the line by one pixel vertically, staying inside the crop.
+            if h_start > 0:
+                h_start -= 1
+            h_end = min(h_end + 1, h)
+            split_bbox_list.append([w_start, h_start, w_end, h_end])
+        return split_bbox_list
+
+    def shrink_bbox(self, bbox):
+        """Shrink ``[left, top, right, bottom]`` by 10% of its extent per side.
+
+        The shrink is at least 1 pixel and at most ``shrink_w_max`` /
+        ``shrink_h_max``. An axis that would collapse keeps its original
+        coordinates.
+        """
+        left, top, right, bottom = bbox
+        sh_h = min(max(int((bottom - top) * 0.1), 1), self.shrink_h_max)
+        sh_w = min(max(int((right - left) * 0.1), 1), self.shrink_w_max)
+        left_new = left + sh_w
+        right_new = right - sh_w
+        top_new = top + sh_h
+        bottom_new = bottom - sh_h
+        if left_new >= right_new:
+            left_new = left
+            right_new = right
+        if top_new >= bottom_new:
+            top_new = top
+            bottom_new = bottom
+        return [left_new, top_new, right_new, bottom_new]
+
+    def __call__(self, data):
+        """Paint the shrunk text-line boxes of every cell into a mask.
+
+        Reads ``data['image']`` and ``data['cells']``. The mask is stored
+        in ``data`` only once at least one box has been painted.
+        """
+        img = data['image']
+        cells = data['cells']
+        height, width = img.shape[0:2]
+        if self.mask_type == 1:
+            mask_img = np.zeros((height, width), dtype=np.float32)
+        else:
+            mask_img = np.zeros((height, width, 3), dtype=np.float32)
+        for cell in cells:
+            if "bbox" not in cell:
+                continue
+            cell_left, cell_top, cell_right, cell_bottom = cell['bbox']
+            # ``img`` keeps pointing at the source image even after
+            # data['image'] is replaced below, so crops stay unaffected.
+            box_img = img[cell_top:cell_bottom, cell_left:cell_right, :].copy()
+            for s_left, s_top, s_right, s_bottom in self.projection_cx(
+                    box_img):
+                # Move the line box from crop to image coordinates.
+                left, top, right, bottom = self.shrink_bbox([
+                    s_left + cell_left, s_top + cell_top,
+                    s_right + cell_left, s_bottom + cell_top
+                ])
+                if self.mask_type == 1:
+                    mask_img[top:bottom, left:right] = 1.0
+                    data['mask_img'] = mask_img
+                else:
+                    mask_img[top:bottom, left:right, :] = (255, 255, 255)
+                    data['image'] = mask_img
+        return data
+
+class ResizeTableImage(object):
+    """Resize a table image so that its longer side equals ``max_len`` and
+    scale the cell boxes by the same ratio."""
+
+    def __init__(self, max_len, **kwargs):
+        super(ResizeTableImage, self).__init__()
+        self.max_len = max_len
+
+    def get_img_bbox(self, cells):
+        """Return the 'bbox' entry of every cell that has one, in order."""
+        return [cell['bbox'] for cell in cells if "bbox" in cell]
+
+    def resize_img_table(self, img, bbox_list, max_len):
+        """Resize ``img`` and scale ``bbox_list`` with the same ratio.
+
+        Returns:
+            ``(img_new, bbox_list_new)`` where each new box is a list of
+            four ints ``[left, top, right, bottom]``.
+        """
+        height, width = img.shape[0:2]
+        ratio = max_len / (max(height, width) * 1.0)
+        resize_h = int(height * ratio)
+        resize_w = int(width * ratio)
+        img_new = cv2.resize(img, (resize_w, resize_h))
+        bbox_list_new = []
+        for bbox in bbox_list:
+            left, top, right, bottom = bbox.copy()
+            bbox_list_new.append(
+                [int(coord * ratio) for coord in (left, top, right, bottom)])
+        return img_new, bbox_list_new
+
+    def __call__(self, data):
+        """Resize ``data['image']``, rescale cell boxes in place and record
+        ``data['max_len']``."""
+        img = data['image']
+        cells = data['cells'] if 'cells' in data else []
+        bbox_list = self.get_img_bbox(cells)
+        img_new, bbox_list_new = self.resize_img_table(img, bbox_list,
+                                                       self.max_len)
+        data['image'] = img_new
+        # Hand the scaled boxes back to the cells they came from, in order.
+        scaled = iter(bbox_list_new)
+        for cell in cells:
+            if "bbox" in cell:
+                cell['bbox'] = next(scaled)
+        data['max_len'] = self.max_len
+        return data
+
+class PaddingTableImage(object):
+    """Place the image in the top-left corner of a zero-filled
+    ``max_len x max_len x 3`` float32 canvas."""
+
+    def __init__(self, **kwargs):
+        super(PaddingTableImage, self).__init__()
+
+    def __call__(self, data):
+        """Replace ``data['image']`` with its padded version; the canvas
+        side is read from ``data['max_len']``."""
+        src = data['image']
+        side = data['max_len']
+        canvas = np.zeros((side, side, 3), dtype=np.float32)
+        rows, cols = src.shape[0:2]
+        canvas[0:rows, 0:cols, :] = src.copy()
+        data['image'] = canvas
+        return data
+            
\ No newline at end of file
--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -19,6 +19,7 @@ from __future__ import unicode_literals

 import numpy as np
 import string
+import json


 class ClsLabelEncode(object):
@@ -39,7 +40,6 @@ class DetLabelEncode(object):
        pass

    def __call__(self, data):
-        import json
        label = data['label']
        label = json.loads(label)
        nBox = len(label)
@@ -53,6 +53,8 @@ class DetLabelEncode(object):
                txt_tags.append(True)
            else:
                txt_tags.append(False)
+        if len(boxes) == 0:
+            return None
        boxes = self.expand_points_num(boxes)
        boxes = np.array(boxes, dtype=np.float32)
        txt_tags = np.array(txt_tags, dtype=np.bool)
@@ -351,3 +353,171 @@ class SRNLabelEncode(BaseRecLabelEncode):
            assert False, "Unsupport type %s in get_beg_end_flag_idx" \
                          % beg_or_end
        return idx
+
+
+class TableLabelEncode(object):
+    """Encode table structure tokens and cell boxes into index arrays.
+
+    The dictionary file starts with a header line ``<n_chars>\\t<n_elems>``
+    followed by that many character lines and then element lines.
+    """
+
+    def __init__(self,
+                 max_text_length,
+                 max_elem_length,
+                 max_cell_num,
+                 character_dict_path,
+                 span_weight=1.0,
+                 **kwargs):
+        self.max_text_length = max_text_length
+        self.max_elem_length = max_elem_length
+        self.max_cell_num = max_cell_num
+        list_character, list_elem = self.load_char_elem_dict(
+            character_dict_path)
+        list_character = self.add_special_char(list_character)
+        list_elem = self.add_special_char(list_elem)
+        # Token -> index lookup tables.
+        self.dict_character = {
+            char: i
+            for i, char in enumerate(list_character)
+        }
+        self.dict_elem = {elem: i for i, elem in enumerate(list_elem)}
+        self.span_weight = span_weight
+
+    def load_char_elem_dict(self, character_dict_path):
+        """Read the character list and the element list from the dict file."""
+        with open(character_dict_path, "rb") as fin:
+            lines = fin.readlines()
+        header = lines[0].decode('utf-8').strip("\r\n").split("\t")
+        character_num = int(header[0])
+        elem_num = int(header[1])
+        elem_begin = 1 + character_num
+
+        list_character = [
+            lines[cno].decode('utf-8').strip("\r\n")
+            for cno in range(1, elem_begin)
+        ]
+        list_elem = [
+            lines[eno].decode('utf-8').strip("\r\n")
+            for eno in range(elem_begin, elem_begin + elem_num)
+        ]
+        return list_character, list_elem
+
+    def add_special_char(self, list_character):
+        """Return the list wrapped in the begin ("sos") and end ("eos") tokens."""
+        self.beg_str = "sos"
+        self.end_str = "eos"
+        return [self.beg_str] + list_character + [self.end_str]
+
+    def get_span_idx_list(self):
+        """Return the indices of all elements whose token contains 'span'."""
+        return [idx for elem, idx in self.dict_elem.items() if 'span' in elem]
+
+    def __call__(self, data):
+        """Encode ``data['structure']`` and ``data['cells']``.
+
+        Returns:
+            ``data`` extended with 'structure', 'bbox_list',
+            'bbox_list_mask', 'structure_mask' and 'sp_tokens', or None
+            when the structure tokens cannot be encoded.
+        """
+        cells = data['cells']
+        structure = self.encode(data['structure']['tokens'], 'elem')
+        if structure is None:
+            return None
+        elem_num = len(structure)
+        padded_len = self.max_elem_length + 2
+        # Wrap in begin/end indices, then zero-pad to the fixed length.
+        structure = [0] + structure + [len(self.dict_elem) - 1]
+        structure = structure + [0] * (padded_len - len(structure))
+        structure = np.array(structure)
+        data['structure'] = structure
+        elem_char_idx1 = self.dict_elem['<td>']
+        elem_char_idx2 = self.dict_elem['<td']
+        span_idx_list = self.get_span_idx_list()
+        # Positions in the sequence that open a table cell.
+        is_td = np.logical_or(structure == elem_char_idx1,
+                              structure == elem_char_idx2)
+        td_idx_list = np.where(is_td)[0]
+
+        structure_mask = np.ones((padded_len, 1), dtype=np.float32)
+        bbox_list = np.zeros((padded_len, 4), dtype=np.float32)
+        bbox_list_mask = np.zeros((padded_len, 1), dtype=np.float32)
+        img_height, img_width, _ = data['image'].shape
+        if len(span_idx_list) > 0:
+            span_weight = len(td_idx_list) * 1.0 / len(span_idx_list)
+            span_weight = min(max(span_weight, 1.0), self.span_weight)
+        # x coordinates are divided by the width, y coordinates by the height.
+        divisors = (img_width, img_height, img_width, img_height)
+        for cno, cell in enumerate(cells):
+            if 'bbox' not in cell:
+                continue
+            bbox = cell['bbox'].copy()
+            for k, divisor in enumerate(divisors):
+                bbox[k] = bbox[k] * 1.0 / divisor
+            td_idx = td_idx_list[cno]
+            bbox_list[td_idx] = bbox
+            bbox_list_mask[td_idx] = 1.0
+            # Re-weight the token right after the cell opener if it is a span.
+            cand_span_idx = td_idx + 1
+            if cand_span_idx < padded_len and \
+                    structure[cand_span_idx] in span_idx_list:
+                structure_mask[cand_span_idx] = span_weight
+
+        data['bbox_list'] = bbox_list
+        data['bbox_list_mask'] = bbox_list_mask
+        data['structure_mask'] = structure_mask
+        char_beg_idx = self.get_beg_end_flag_idx('beg', 'char')
+        char_end_idx = self.get_beg_end_flag_idx('end', 'char')
+        elem_beg_idx = self.get_beg_end_flag_idx('beg', 'elem')
+        elem_end_idx = self.get_beg_end_flag_idx('end', 'elem')
+        data['sp_tokens'] = np.array([
+            char_beg_idx, char_end_idx, elem_beg_idx, elem_end_idx,
+            elem_char_idx1, elem_char_idx2, self.max_text_length,
+            self.max_elem_length, self.max_cell_num, elem_num
+        ])
+        return data
+
+    def encode(self, text, char_or_elem):
+        """Convert a token sequence into a list of indices.
+
+        Returns None when the sequence is too long, contains an unknown
+        token, or is an empty element sequence; an empty character
+        sequence is encoded as the single 'space' index.
+        """
+        is_char = char_or_elem == "char"
+        if is_char:
+            max_len = self.max_text_length
+            current_dict = self.dict_character
+        else:
+            max_len = self.max_elem_length
+            current_dict = self.dict_elem
+        if len(text) > max_len:
+            return None
+        if len(text) == 0:
+            return [self.dict_character['space']] if is_char else None
+        text_list = []
+        for char in text:
+            if char not in current_dict:
+                return None
+            text_list.append(current_dict[char])
+        if len(text_list) == 0:
+            return [self.dict_character['space']] if is_char else None
+        return text_list
+
+    def get_ignored_tokens(self, char_or_elem):
+        """Return ``[begin_index, end_index]`` for the requested dictionary."""
+        return [
+            self.get_beg_end_flag_idx("beg", char_or_elem),
+            self.get_beg_end_flag_idx("end", char_or_elem)
+        ]
+
+    def get_beg_end_flag_idx(self, beg_or_end, char_or_elem):
+        """Return the begin or end token index, wrapped in ``np.array``."""
+        if char_or_elem == "char":
+            lookup = self.dict_character
+            bad_flag_msg = "Unsupport type %s in get_beg_end_flag_idx of char"
+        elif char_or_elem == "elem":
+            lookup = self.dict_elem
+            bad_flag_msg = "Unsupport type %s in get_beg_end_flag_idx of elem"
+        else:
+            assert False, "Unsupport type %s in char_or_elem" \
+                              % char_or_elem
+        if beg_or_end == "beg":
+            idx = np.array(lookup[self.beg_str])
+        elif beg_or_end == "end":
+            idx = np.array(lookup[self.end_str])
+        else:
+            assert False, bad_flag_msg % beg_or_end
+        return idx
--- a/ppocr/data/imaug/operators.py
+++ b/ppocr/data/imaug/operators.py
@@ -81,7 +81,7 @@ class NormalizeImage(object):
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        data['image'] = (
-            img.astype('float32') * self.scale - self.mean) / self.std
+                                img.astype('float32') * self.scale - self.mean) / self.std
        return data


@@ -163,7 +163,7 @@ class DetResizeForTest(object):
            img, (ratio_h, ratio_w)
        """
        limit_side_len = self.limit_side_len
-        h, w, _ = img.shape
+        h, w, c = img.shape

        # limit the max side
        if self.limit_type == 'max':
@@ -174,7 +174,7 @@ class DetResizeForTest(object):
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
-        else:
+        elif self.limit_type == 'min':
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
@@ -182,6 +182,10 @@ class DetResizeForTest(object):
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
+        elif self.limit_type == 'resize_long':
+            ratio = float(limit_side_len) / max(h,w)
+        else:
+            raise Exception('not support limit type, image ')
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)


--- a/ppocr/data/pubtab_dataset.py
+++ b/ppocr/data/pubtab_dataset.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import os
+import random
+from paddle.io import Dataset
+import json
+
+from .imaug import transform, create_operators
+
+
+class PubTabDataSet(Dataset):
+    """Dataset over a label file with one JSON-encoded table sample per line.
+
+    Each line is parsed in ``__getitem__`` and must provide 'filename' and
+    'html' (with 'cells' and 'structure'). Samples may be rejected at
+    random (hard-select / complex-table sampling); rejected or failing
+    samples are replaced by another randomly drawn sample.
+    """
+
+    # Upper bound on consecutive resampling attempts in __getitem__.
+    _MAX_RESAMPLE_ATTEMPTS = 100000
+
+    def __init__(self, config, mode, logger, seed=None):
+        super(PubTabDataSet, self).__init__()
+        self.logger = logger
+
+        global_config = config['Global']
+        dataset_config = config[mode]['dataset']
+        loader_config = config[mode]['loader']
+
+        label_file_path = dataset_config.pop('label_file_path')
+
+        self.data_dir = dataset_config['data_dir']
+        self.do_shuffle = loader_config['shuffle']
+        self.do_hard_select = False
+        if 'hard_select' in loader_config:
+            self.do_hard_select = loader_config['hard_select']
+            self.hard_prob = loader_config['hard_prob']
+        if self.do_hard_select:
+            # NOTE(review): load_hard_select_prob is not defined in this
+            # class as shown; confirm where it comes from before enabling
+            # 'hard_select'.
+            self.img_select_prob = self.load_hard_select_prob()
+        self.table_select_type = None
+        if 'table_select_type' in loader_config:
+            self.table_select_type = loader_config['table_select_type']
+            self.table_select_prob = loader_config['table_select_prob']
+
+        self.seed = seed
+        logger.info("Initialize indexs of datasets:%s" % label_file_path)
+        with open(label_file_path, "rb") as f:
+            self.data_lines = f.readlines()
+        self.data_idx_order_list = list(range(len(self.data_lines)))
+        if mode.lower() == "train":
+            self.shuffle_data_random()
+        self.ops = create_operators(dataset_config['transforms'], global_config)
+
+    def shuffle_data_random(self):
+        """Shuffle the label lines in place (seeded) when shuffling is on."""
+        if self.do_shuffle:
+            random.seed(self.seed)
+            random.shuffle(self.data_lines)
+        return
+
+    def _load_sample(self, idx):
+        """Parse and transform the sample at ``idx``.
+
+        Returns:
+            The transformed sample, or None when the sample is rejected by
+            the random selection or an error occurs (the error is logged).
+        """
+        # Fetched before the try block so the error handler below can
+        # always reference it.
+        data_line = self.data_lines[idx]
+        try:
+            data_line = data_line.decode('utf-8').strip("\n")
+            info = json.loads(data_line)
+            file_name = info['filename']
+            select_flag = True
+            if self.do_hard_select:
+                prob = self.img_select_prob[file_name]
+                if prob < random.uniform(0, 1):
+                    select_flag = False
+
+            if self.table_select_type:
+                structure_str = ''.join(info['html']['structure']['tokens'])
+                # Tables with spanning cells are treated as "complex" and
+                # kept only with probability table_select_prob.
+                is_complex = ('colspan' in structure_str or
+                              'rowspan' in structure_str)
+                if is_complex and \
+                        self.table_select_prob < random.uniform(0, 1):
+                    select_flag = False
+
+            if not select_flag:
+                return None
+
+            cells = info['html']['cells'].copy()
+            structure = info['html']['structure'].copy()
+            img_path = os.path.join(self.data_dir, file_name)
+            data = {
+                'img_path': img_path,
+                'cells': cells,
+                'structure': structure
+            }
+            if not os.path.exists(img_path):
+                raise Exception("{} does not exist!".format(img_path))
+            with open(data['img_path'], 'rb') as f:
+                data['image'] = f.read()
+            return transform(data, self.ops)
+        except Exception as e:
+            self.logger.error(
+                "When parsing line {}, error happened with msg: {}".format(
+                    data_line, e))
+            return None
+
+    def __getitem__(self, idx):
+        """Return the transformed sample at ``idx``.
+
+        When the sample is rejected or fails, another index is drawn at
+        random. This is a loop, not recursion, so a low selection
+        probability cannot exhaust the interpreter's recursion limit.
+
+        Raises:
+            RuntimeError: if no usable sample is found within
+                ``_MAX_RESAMPLE_ATTEMPTS`` attempts.
+        """
+        for _ in range(self._MAX_RESAMPLE_ATTEMPTS):
+            outs = self._load_sample(idx)
+            if outs is not None:
+                return outs
+            idx = np.random.randint(self.__len__())
+        raise RuntimeError(
+            "No usable sample found after {} attempts".format(
+                self._MAX_RESAMPLE_ATTEMPTS))
+
+    def __len__(self):
+        return len(self.data_idx_order_list)
--- a/ppocr/data/simple_dataset.py
+++ b/ppocr/data/simple_dataset.py
@@ -69,12 +69,42 @@ class SimpleDataSet(Dataset):
        random.shuffle(self.data_lines)
        return

+    def get_ext_data(self):
+        """Draw extra random samples for an op that asks for them.
+
+        The first op in ``self.ops`` that has an ``ext_data_num`` attribute
+        decides how many samples are collected; with no such op nothing is
+        drawn and an empty list is returned. Each sample is read from a
+        randomly chosen data line and passed through the first two ops only.
+        Samples whose image file is missing, or for which ``transform``
+        returns None, are discarded and another one is drawn.
+
+        Returns:
+            list: the collected samples, as returned by ``transform``.
+        """
+        wanted = next(
+            (getattr(op, 'ext_data_num')
+             for op in self.ops if hasattr(op, 'ext_data_num')), 0)
+        # Only the leading two ops run on the extra samples
+        # (presumably image decoding and label parsing -- TODO confirm).
+        loaders = self.ops[:2]
+        collected = []
+
+        # NOTE(review): this loop only ends once enough samples load
+        # successfully; it never terminates if none of them can be loaded.
+        while len(collected) < wanted:
+            pick = np.random.randint(self.__len__())
+            raw_line = self.data_lines[self.data_idx_order_list[pick]]
+            fields = raw_line.decode('utf-8').strip("\n").split(self.delimiter)
+            file_name, label = fields[0], fields[1]
+            img_path = os.path.join(self.data_dir, file_name)
+            if not os.path.exists(img_path):
+                continue
+            sample = {'img_path': img_path, 'label': label}
+            with open(sample['img_path'], 'rb') as f:
+                sample['image'] = f.read()
+            sample = transform(sample, loaders)
+            if sample is not None:
+                collected.append(sample)
+        return collected
+
    def __getitem__(self, idx):
        file_idx = self.data_idx_order_list[idx]
        data_line = self.data_lines[file_idx]
        try:
            data_line = data_line.decode('utf-8')
-            substr = data_line.strip("\n").strip("\r").split(self.delimiter)
+            substr = data_line.strip("\n").split(self.delimiter)
            file_name = substr[0]
            label = substr[1]
            img_path = os.path.join(self.data_dir, file_name)
@@ -84,6 +114,7 @@ class SimpleDataSet(Dataset):
            with open(data['img_path'], 'rb') as f:
                img = f.read()
                data['image'] = img
+            data['ext_data'] = self.get_ext_data()
            outs = transform(data, self.ops)
        except Exception as e:
            self.logger.error(

--- a/ppocr/losses/__init__.py
+++ b/ppocr/losses/__init__.py
@@ -13,28 +13,39 @@
 # limitations under the License.

 import copy
+import paddle
+import paddle.nn as nn

+# det loss
+from .det_db_loss import DBLoss
+from .det_east_loss import EASTLoss
+from .det_sast_loss import SASTLoss

-def build_loss(config):
-    # det loss
-    from .det_db_loss import DBLoss
-    from .det_east_loss import EASTLoss
-    from .det_sast_loss import SASTLoss
+# rec loss
+from .rec_ctc_loss import CTCLoss
+from .rec_att_loss import AttentionLoss
+from .rec_srn_loss import SRNLoss
+
+# cls loss
+from .cls_loss import ClsLoss
+
+# e2e loss
+from .e2e_pg_loss import PGLoss

-    # rec loss
-    from .rec_ctc_loss import CTCLoss
-    from .rec_att_loss import AttentionLoss
-    from .rec_srn_loss import SRNLoss
+# basic loss function
+from .basic_loss import DistanceLoss

-    # cls loss
-    from .cls_loss import ClsLoss
+# combined loss function
+from .combined_loss import CombinedLoss

-    # e2e loss
-    from .e2e_pg_loss import PGLoss
+# table loss
+from .table_att_loss import TableAttentionLoss
+
+def build_loss(config):
    support_dict = [
        'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss',
-        'SRNLoss', 'PGLoss']
-
+        'SRNLoss', 'PGLoss', 'CombinedLoss', 'TableAttentionLoss'
+    ]
    config = copy.deepcopy(config)
    module_name = config.pop('name')
    assert module_name in support_dict, Exception('loss only support {}'.format(

--- a/ppocr/losses/basic_loss.py
+++ b/ppocr/losses/basic_loss.py
+#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn import L1Loss
+from paddle.nn import MSELoss as L2Loss
+from paddle.nn import SmoothL1Loss
+
+
+class CELoss(nn.Layer):
+    """Cross-entropy loss with optional label smoothing.
+
+    Args:
+        epsilon: label-smoothing factor. ``None`` or any value outside the
+            open interval (0, 1) disables smoothing.
+    """
+
+    def __init__(self, epsilon=None):
+        super().__init__()
+        # Out-of-range factors are treated the same as "no smoothing".
+        if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
+            epsilon = None
+        self.epsilon = epsilon
+
+    def _labelsmoothing(self, target, class_num):
+        """Return smoothed targets reshaped to ``[-1, class_num]``.
+
+        A target whose last dimension differs from ``class_num`` is one-hot
+        encoded first; otherwise it is smoothed as given.
+        """
+        if target.shape[-1] != class_num:
+            one_hot_target = F.one_hot(target, class_num)
+        else:
+            one_hot_target = target
+        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
+        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
+        return soft_target
+
+    def forward(self, x, label):
+        """Compute the loss of predictions ``x`` against ``label``.
+
+        With smoothing enabled the result is the sum over the last axis of
+        ``-log_softmax(x) * smoothed_label``. Otherwise
+        ``F.cross_entropy`` is used; a label whose last dimension equals that
+        of ``x`` is passed through softmax and treated as a soft label.
+
+        Returns:
+            The loss tensor.
+        """
+        if self.epsilon is not None:
+            class_num = x.shape[-1]
+            label = self._labelsmoothing(label, class_num)
+            x = -F.log_softmax(x, axis=-1)
+            loss = paddle.sum(x * label, axis=-1)
+        else:
+            if label.shape[-1] == x.shape[-1]:
+                # Same width as the predictions: presumably raw scores from
+                # another model, so normalise them -- TODO confirm.
+                label = F.softmax(label, axis=-1)
+                soft_label = True
+            else:
+                soft_label = False
+            loss = F.cross_entropy(x, label=label, soft_label=soft_label)
+        return loss
+
+
+class KLJSLoss(object):
+    """Element-wise divergence between two tensors, in 'kl' or 'js' mode.
+
+    Args:
+        mode: one of 'kl', 'js', 'KL', 'JS'.
+    """
+
+    def __init__(self, mode='kl'):
+        allowed = ['kl', 'js', 'KL', 'JS']
+        assert mode in allowed, "mode can only be one of ['kl', 'js', 'KL', 'JS']"
+        self.mode = mode
+
+    def __call__(self, p1, p2, reduction="mean"):
+        """Return the divergence of ``p1`` and ``p2``.
+
+        The base term is ``p2 * log((p2 + 1e-5) / (p1 + 1e-5) + 1e-5)``. In
+        'js' mode the mirrored term is added and the sum is halved.
+
+        Args:
+            p1, p2: input tensors.
+            reduction: "mean" averages over axes 1 and 2; "none" or None
+                returns the unreduced tensor; any other value sums over
+                axes 1 and 2.
+        """
+        ratio_21 = (p2+1e-5)/(p1+1e-5)
+        loss = paddle.multiply(p2, paddle.log(ratio_21 + 1e-5))
+
+        if self.mode.lower() == "js":
+            ratio_12 = (p1+1e-5)/(p2+1e-5)
+            loss += paddle.multiply(p1, paddle.log(ratio_12 + 1e-5))
+            loss *= 0.5
+
+        if reduction == "mean":
+            return paddle.mean(loss, axis=[1,2])
+        if reduction == "none" or reduction is None:
+            return loss
+        return paddle.sum(loss, axis=[1,2])
+
+class DMLLoss(nn.Layer):
+    """
+    DMLLoss:
+        act: optional activation applied to both inputs before the loss,
+            either "softmax" (over the last axis) or "sigmoid".
+    """
+
+    def __init__(self, act=None):
+        super().__init__()
+        assert act is None or act in ["softmax", "sigmoid"]
+        if act == "softmax":
+            activation = nn.Softmax(axis=-1)
+        elif act == "sigmoid":
+            activation = nn.Sigmoid()
+        else:
+            activation = None
+        self.act = activation
+
+        self.jskl_loss = KLJSLoss(mode="js")
+
+    def forward(self, out1, out2):
+        """Return the symmetric divergence between ``out1`` and ``out2``.
+
+        Inputs with at least two dimensions go through ``KLJSLoss`` in 'js'
+        mode; lower-rank inputs use the average of the two directed
+        ``F.kl_div`` values with 'batchmean' reduction.
+        """
+        if self.act is not None:
+            out1, out2 = self.act(out1), self.act(out2)
+
+        if len(out1.shape) >= 2:
+            return self.jskl_loss(out1, out2)
+
+        # F.kl_div takes log-probabilities as its first argument.
+        kl_12 = F.kl_div(paddle.log(out1), out2, reduction='batchmean')
+        kl_21 = F.kl_div(paddle.log(out2), out1, reduction='batchmean')
+        return (kl_12 + kl_21) / 2.0
+
+
+class DistanceLoss(nn.Layer):
+    """
+    DistanceLoss:
+        mode: loss mode, one of "l1", "l2" or "smooth_l1"
+        kargs: keyword arguments forwarded to the selected paddle loss layer
+    """
+
+    def __init__(self, mode="l2", **kargs):
+        super().__init__()
+        assert mode in ["l1", "l2", "smooth_l1"]
+        # Map each mode name to the paddle layer that implements it.
+        loss_layers = {
+            "l1": nn.L1Loss,
+            "l2": nn.MSELoss,
+            "smooth_l1": nn.SmoothL1Loss,
+        }
+        layer_cls = loss_layers.get(mode)
+        if layer_cls is not None:
+            self.loss_func = layer_cls(**kargs)
+
+    def forward(self, x, y):
+        """Apply the configured loss layer to ``x`` and ``y``."""
+        distance = self.loss_func(x, y)
+        return distance
--- a/ppocr/losses/cls_loss.py
+++ b/ppocr/losses/cls_loss.py
@@ -24,7 +24,7 @@ class ClsLoss(nn.Layer):
        super(ClsLoss, self).__init__()
        self.loss_func = nn.CrossEntropyLoss(reduction='mean')

-    def __call__(self, predicts, batch):
+    def forward(self, predicts, batch):
        label = batch[1]
        loss = self.loss_func(input=predicts, label=label)
        return {'loss': loss}