Revert "add table eval and predict script" (#3062)

3302a0b1 · zhoujun · GitHub · 85aeae71 · 85aeae71 · 85aeae71
Unverified commit 3302a0b1, authored Jun 09, 2021 by zhoujun; committed by GitHub on Jun 09, 2021
12 changed files
--- a/ppstructure/table/predict_structure.py
+++ b/ppstructure/table/predict_structure.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-
-import cv2
-import numpy as np
-import math
-import time
-import traceback
-import paddle
-
-import tools.infer.utility as utility
-from ppocr.data import create_operators, transform
-from ppocr.postprocess import build_post_process
-from ppocr.utils.logging import get_logger
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-logger = get_logger()
-
-
-class TableStructurer(object):
-    def __init__(self, args):
-        pre_process_list = [{
-            'ResizeTableImage': {
-                'max_len': args.structure_max_len
-            }
-        }, {
-            'NormalizeImage': {
-                'std': [0.229, 0.224, 0.225],
-                'mean': [0.485, 0.456, 0.406],
-                'scale': '1./255.',
-                'order': 'hwc'
-            }
-        }, {
-            'PaddingTableImage': None
-        }, {
-            'ToCHWImage': None
-        }, {
-            'KeepKeys': {
-                'keep_keys': ['image']
-            }
-        }]
-        postprocess_params = {
-            'name': 'TableLabelDecode',
-            "character_type": args.structure_char_type,
-            "character_dict_path": args.structure_char_dict_path,
-            "max_text_length": args.structure_max_text_length,
-            "max_elem_length": args.structure_max_elem_length,
-            "max_cell_num": args.structure_max_cell_num
-        }
-
-        self.preprocess_op = create_operators(pre_process_list)
-        self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor, self.output_tensors = \
-            utility.create_predictor(args, 'structure', logger)
-
-    def __call__(self, img):
-        ori_im = img.copy()
-        data = {'image': img}
-        data = transform(data, self.preprocess_op)
-        img = data[0]
-        if img is None:
-            return None, 0
-        img = np.expand_dims(img, axis=0)
-        img = img.copy()
-        starttime = time.time()
-
-        self.input_tensor.copy_from_cpu(img)
-        self.predictor.run()
-        outputs = []
-        for output_tensor in self.output_tensors:
-            output = output_tensor.copy_to_cpu()
-            outputs.append(output)
-
-        preds = {}
-        preds['structure_probs'] = outputs[1]
-        preds['loc_preds'] = outputs[0]
-
-        post_result = self.postprocess_op(preds)
-
-        structure_str_list = post_result['structure_str_list']
-        res_loc = post_result['res_loc']
-        imgh, imgw = ori_im.shape[0:2]
-        res_loc_final = []
-        for rno in range(len(res_loc[0])):
-            x0, y0, x1, y1 = res_loc[0][rno]
-            left = max(int(imgw * x0), 0)
-            top = max(int(imgh * y0), 0)
-            right = min(int(imgw * x1), imgw - 1)
-            bottom = min(int(imgh * y1), imgh - 1)
-            res_loc_final.append([left, top, right, bottom])
-
-        structure_str_list = structure_str_list[0][:-1]
-        structure_str_list = ['<html>', '<body>', '<table>'] + structure_str_list + ['</table>', '</body>', '</html>']
-
-        elapse = time.time() - starttime
-        return (structure_str_list, res_loc_final), elapse
-
-
-def main(args):
-    image_file_list = get_image_file_list(args.image_dir)
-    table_structurer = TableStructurer(args)
-    count = 0
-    total_time = 0
-    for image_file in image_file_list:
-        img, flag = check_and_read_gif(image_file)
-        if not flag:
-            img = cv2.imread(image_file)
-        if img is None:
-            logger.info("error in loading image:{}".format(image_file))
-            continue
-        structure_res, elapse = table_structurer(img)
-
-        logger.info("result: {}".format(structure_res))
-
-        if count > 0:
-            total_time += elapse
-        count += 1
-        logger.info("Predict time of {}: {}".format(image_file, elapse))
-
-
-if __name__ == "__main__":
-    main(utility.parse_args())
--- a/ppstructure/table/predict_table.py
+++ b/ppstructure/table/predict_table.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import subprocess
-
-__dir__ = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(__dir__)
-sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
-sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
-
-os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
-import cv2
-import copy
-import numpy as np
-import time
-import tools.infer.predict_rec as predict_rec
-import tools.infer.predict_det as predict_det
-from ppocr.utils.utility import get_image_file_list, check_and_read_gif
-from ppocr.utils.logging import get_logger
-from ppstructure.table.matcher import distance, compute_iou
-from ppstructure.utility import parse_args
-import ppstructure.table.predict_structure as predict_strture
-
-logger = get_logger()
-
-
-def expand(pix, det_box, shape):
-    x0, y0, x1, y1 = det_box
-    #     print(shape)
-    h, w, c = shape
-    tmp_x0 = x0 - pix
-    tmp_x1 = x1 + pix
-    tmp_y0 = y0 - pix
-    tmp_y1 = y1 + pix
-    x0_ = tmp_x0 if tmp_x0 >= 0 else 0
-    x1_ = tmp_x1 if tmp_x1 <= w else w
-    y0_ = tmp_y0 if tmp_y0 >= 0 else 0
-    y1_ = tmp_y1 if tmp_y1 <= h else h
-    return x0_, y0_, x1_, y1_
-
-
-class TableSystem(object):
-    def __init__(self, args, text_detector=None, text_recognizer=None):
-        self.text_detector = predict_det.TextDetector(args) if text_detector is None else text_detector
-        self.text_recognizer = predict_rec.TextRecognizer(args) if text_recognizer is None else text_recognizer
-        self.table_structurer = predict_strture.TableStructurer(args)
-
-    def __call__(self, img):
-        ori_im = img.copy()
-        structure_res, elapse = self.table_structurer(copy.deepcopy(img))
-        dt_boxes, elapse = self.text_detector(copy.deepcopy(img))
-        dt_boxes = sorted_boxes(dt_boxes)
-
-        r_boxes = []
-        for box in dt_boxes:
-            x_min = box[:, 0].min() - 1
-            x_max = box[:, 0].max() + 1
-            y_min = box[:, 1].min() - 1
-            y_max = box[:, 1].max() + 1
-            box = [x_min, y_min, x_max, y_max]
-            r_boxes.append(box)
-        dt_boxes = np.array(r_boxes)
-
-        logger.debug("dt_boxes num : {}, elapse : {}".format(
-            len(dt_boxes), elapse))
-        if dt_boxes is None:
-            return None, None
-        img_crop_list = []
-
-        for i in range(len(dt_boxes)):
-            det_box = dt_boxes[i]
-            x0, y0, x1, y1 = expand(2, det_box, ori_im.shape)
-            text_rect = ori_im[int(y0):int(y1), int(x0):int(x1), :]
-            img_crop_list.append(text_rect)
-        rec_res, elapse = self.text_recognizer(img_crop_list)
-        logger.debug("rec_res num  : {}, elapse : {}".format(
-            len(rec_res), elapse))
-
-        pred_html, pred = self.rebuild_table(structure_res, dt_boxes, rec_res)
-        return pred_html
-
-    def rebuild_table(self, structure_res, dt_boxes, rec_res):
-        pred_structures, pred_bboxes = structure_res
-        matched_index = self.match_result(dt_boxes, pred_bboxes)
-        pred_html, pred = self.get_pred_html(pred_structures, matched_index, rec_res)
-        return pred_html, pred
-
-    def match_result(self, dt_boxes, pred_bboxes):
-        matched = {}
-        for i, gt_box in enumerate(dt_boxes):
-            # gt_box = [np.min(gt_box[:, 0]), np.min(gt_box[:, 1]), np.max(gt_box[:, 0]), np.max(gt_box[:, 1])]
-            distances = []
-            for j, pred_box in enumerate(pred_bboxes):
-                distances.append(
-                    (distance(gt_box, pred_box), 1. - compute_iou(gt_box, pred_box)))  # 获取两两cell之间的L1距离和 1- IOU
-            sorted_distances = distances.copy()
-            # 根据距离和IOU挑选最"近"的cell
-            sorted_distances = sorted(sorted_distances, key=lambda item: (item[1], item[0]))
-            if distances.index(sorted_distances[0]) not in matched.keys():
-                matched[distances.index(sorted_distances[0])] = [i]
-            else:
-                matched[distances.index(sorted_distances[0])].append(i)
-        return matched
-
-    def get_pred_html(self, pred_structures, matched_index, ocr_contents):
-        end_html = []
-        td_index = 0
-        for tag in pred_structures:
-            if '</td>' in tag:
-                if td_index in matched_index.keys():
-                    b_with = False
-                    if '<b>' in ocr_contents[matched_index[td_index][0]] and len(matched_index[td_index]) > 1:
-                        b_with = True
-                        end_html.extend('<b>')
-                    for i, td_index_index in enumerate(matched_index[td_index]):
-                        content = ocr_contents[td_index_index][0]
-                        if len(matched_index[td_index]) > 1:
-                            if len(content) == 0:
-                                continue
-                            if content[0] == ' ':
-                                content = content[1:]
-                            if '<b>' in content:
-                                content = content[3:]
-                            if '</b>' in content:
-                                content = content[:-4]
-                            if len(content) == 0:
-                                continue
-                            if i != len(matched_index[td_index]) - 1 and ' ' != content[-1]:
-                                content += ' '
-                        end_html.extend(content)
-                    if b_with:
-                        end_html.extend('</b>')
-
-                end_html.append(tag)
-                td_index += 1
-            else:
-                end_html.append(tag)
-        return ''.join(end_html), end_html
-
-
-def sorted_boxes(dt_boxes):
-    """
-    Sort text boxes in order from top to bottom, left to right
-    args:
-        dt_boxes(array):detected text boxes with shape [4, 2]
-    return:
-        sorted boxes(array) with shape [4, 2]
-    """
-    num_boxes = dt_boxes.shape[0]
-    sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
-    _boxes = list(sorted_boxes)
-
-    for i in range(num_boxes - 1):
-        if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
-                (_boxes[i + 1][0][0] < _boxes[i][0][0]):
-            tmp = _boxes[i]
-            _boxes[i] = _boxes[i + 1]
-            _boxes[i + 1] = tmp
-    return _boxes
-
-
-def to_excel(html_table, excel_path):
-    from tablepyxl import tablepyxl
-    tablepyxl.document_to_xl(html_table, excel_path)
-
-
-def main(args):
-    image_file_list = get_image_file_list(args.image_dir)
-    image_file_list = image_file_list[args.process_id::args.total_process_num]
-    os.makedirs(args.output, exist_ok=True)
-
-    text_sys = TableSystem(args)
-    img_num = len(image_file_list)
-    for i, image_file in enumerate(image_file_list):
-        logger.info("[{}/{}] {}".format(i, img_num, image_file))
-        img, flag = check_and_read_gif(image_file)
-        excel_path = os.path.join(args.table_output, os.path.basename(image_file).split('.')[0] + '.xlsx')
-        if not flag:
-            img = cv2.imread(image_file)
-        if img is None:
-            logger.error("error in loading image:{}".format(image_file))
-            continue
-        starttime = time.time()
-        pred_html = text_sys(img)
-
-        to_excel(pred_html, excel_path)
-        logger.info('excel saved to {}'.format(excel_path))
-        logger.info(pred_html)
-        elapse = time.time() - starttime
-        logger.info("Predict time : {:.3f}s".format(elapse))
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    if args.use_mp:
-        p_list = []
-        total_process_num = args.total_process_num
-        for process_id in range(total_process_num):
-            cmd = [sys.executable, "-u"] + sys.argv + [
-                "--process_id={}".format(process_id),
-                "--use_mp={}".format(False)
-            ]
-            p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout)
-            p_list.append(p)
-        for p in p_list:
-            p.wait()
-    else:
-        main(args)
--- a/ppstructure/table/table_metric/__init__.py
+++ b/ppstructure/table/table_metric/__init__.py
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['TEDS']
-from .table_metric import TEDS
\ No newline at end of file
--- a/ppstructure/table/table_metric/parallel.py
+++ b/ppstructure/table/table_metric/parallel.py
-from tqdm import tqdm
-from concurrent.futures import ProcessPoolExecutor, as_completed
-
-
-def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0):
-    """
-        A parallel version of the map function with a progress bar.
-        Args:
-            array (array-like): An array to iterate over.
-            function (function): A python function to apply to the elements of array
-            n_jobs (int, default=16): The number of cores to use
-            use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of
-                keyword arguments to function
-            front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job.
-                Useful for catching bugs
-        Returns:
-            [function(array[0]), function(array[1]), ...]
-    """
-    # We run the first few iterations serially to catch bugs
-    if front_num > 0:
-        front = [function(**a) if use_kwargs else function(a)
-                 for a in array[:front_num]]
-    else:
-        front = []
-    # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
-    if n_jobs == 1:
-        return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])]
-    # Assemble the workers
-    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
-        # Pass the elements of array into function
-        if use_kwargs:
-            futures = [pool.submit(function, **a) for a in array[front_num:]]
-        else:
-            futures = [pool.submit(function, a) for a in array[front_num:]]
-        kwargs = {
-            'total': len(futures),
-            'unit': 'it',
-            'unit_scale': True,
-            'leave': True
-        }
-        # Print out the progress as tasks complete
-        for f in tqdm(as_completed(futures), **kwargs):
-            pass
-    out = []
-    # Get the results from the futures.
-    for i, future in tqdm(enumerate(futures)):
-        try:
-            out.append(future.result())
-        except Exception as e:
-            out.append(e)
-    return front + out
--- a/ppstructure/table/table_metric/table_metric.py
+++ b/ppstructure/table/table_metric/table_metric.py
-# Copyright 2020 IBM
-# Author: peter.zhong@au1.ibm.com
-#
-# This is free software; you can redistribute it and/or modify
-# it under the terms of the Apache 2.0 License.
-#
-# This software is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# Apache 2.0 License for more details.
-
-import distance
-from apted import APTED, Config
-from apted.helpers import Tree
-from lxml import etree, html
-from collections import deque
-from .parallel import parallel_process
-from tqdm import tqdm
-
-
-class TableTree(Tree):
-    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
-        self.tag = tag
-        self.colspan = colspan
-        self.rowspan = rowspan
-        self.content = content
-        self.children = list(children)
-
-    def bracket(self):
-        """Show tree using brackets notation"""
-        if self.tag == 'td':
-            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
-                     (self.tag, self.colspan, self.rowspan, self.content)
-        else:
-            result = '"tag": %s' % self.tag
-        for child in self.children:
-            result += child.bracket()
-        return "{{{}}}".format(result)
-
-
-class CustomConfig(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
-    def rename(self, node1, node2):
-        """Compares attributes of trees"""
-        #print(node1.tag)
-        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
-            return 1.
-        if node1.tag == 'td':
-            if node1.content or node2.content:
-                #print(node1.content, )
-                return self.normalized_distance(node1.content, node2.content)
-        return 0.
-
-
-
-class CustomConfig_del_short(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
-    def rename(self, node1, node2):
-        """Compares attributes of trees"""
-        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
-            return 1.
-        if node1.tag == 'td':
-            if node1.content or node2.content:
-                #print('before')
-                #print(node1.content, node2.content)
-                #print('after')
-                node1_content = node1.content
-                node2_content = node2.content
-                if len(node1_content) < 3:
-                    node1_content = ['####']
-                if len(node2_content) < 3:
-                    node2_content = ['####']   
-                return self.normalized_distance(node1_content, node2_content)
-        return 0.
-
-class CustomConfig_del_block(Config):
-    @staticmethod
-    def maximum(*sequences):
-        """Get maximum possible value
-        """
-        return max(map(len, sequences))
-
-    def normalized_distance(self, *sequences):
-        """Get distance from 0 to 1
-        """
-        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
-
-    def rename(self, node1, node2):
-        """Compares attributes of trees"""
-        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
-            return 1.
-        if node1.tag == 'td':
-            if node1.content or node2.content:
-                
-                node1_content = node1.content
-                node2_content = node2.content
-                while ' '  in node1_content:
-                    print(node1_content.index(' '))
-                    node1_content.pop(node1_content.index(' '))
-                while ' ' in node2_content:
-                    print(node2_content.index(' '))
-                    node2_content.pop(node2_content.index(' '))
-                return self.normalized_distance(node1_content, node2_content)
-        return 0.
-
-class TEDS(object):
-    ''' Tree Edit Distance basead Similarity
-    '''
-
-    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
-        assert isinstance(n_jobs, int) and (
-            n_jobs >= 1), 'n_jobs must be an integer greather than 1'
-        self.structure_only = structure_only
-        self.n_jobs = n_jobs
-        self.ignore_nodes = ignore_nodes
-        self.__tokens__ = []
-
-    def tokenize(self, node):
-        ''' Tokenizes table cells
-        '''
-        self.__tokens__.append('<%s>' % node.tag)
-        if node.text is not None:
-            self.__tokens__ += list(node.text)
-        for n in node.getchildren():
-            self.tokenize(n)
-        if node.tag != 'unk':
-            self.__tokens__.append('</%s>' % node.tag)
-        if node.tag != 'td' and node.tail is not None:
-            self.__tokens__ += list(node.tail)
-
-    def load_html_tree(self, node, parent=None):
-        ''' Converts HTML tree to the format required by apted
-        '''
-        global __tokens__
-        if node.tag == 'td':
-            if self.structure_only:
-                cell = []
-            else:
-                self.__tokens__ = []
-                self.tokenize(node)
-                cell = self.__tokens__[1:-1].copy()
-            new_node = TableTree(node.tag,
-                                 int(node.attrib.get('colspan', '1')),
-                                 int(node.attrib.get('rowspan', '1')),
-                                 cell, *deque())
-        else:
-            new_node = TableTree(node.tag, None, None, None, *deque())
-        if parent is not None:
-            parent.children.append(new_node)
-        if node.tag != 'td':
-            for n in node.getchildren():
-                self.load_html_tree(n, new_node)
-        if parent is None:
-            return new_node
-
-    def evaluate(self, pred, true):
-        ''' Computes TEDS score between the prediction and the ground truth of a
-            given sample
-        '''
-        if (not pred) or (not true):
-            return 0.0
-        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
-        pred = html.fromstring(pred, parser=parser)
-        true = html.fromstring(true, parser=parser)
-        if pred.xpath('body/table') and true.xpath('body/table'):
-            pred = pred.xpath('body/table')[0]
-            true = true.xpath('body/table')[0]
-            if self.ignore_nodes:
-                etree.strip_tags(pred, *self.ignore_nodes)
-                etree.strip_tags(true, *self.ignore_nodes)
-            n_nodes_pred = len(pred.xpath(".//*"))
-            n_nodes_true = len(true.xpath(".//*"))
-            n_nodes = max(n_nodes_pred, n_nodes_true)
-            tree_pred = self.load_html_tree(pred)
-            tree_true = self.load_html_tree(true)
-            distance = APTED(tree_pred, tree_true,
-                             CustomConfig()).compute_edit_distance()
-            return 1.0 - (float(distance) / n_nodes)
-        else:
-            return 0.0
-
-    def batch_evaluate(self, pred_json, true_json):
-        ''' Computes TEDS score between the prediction and the ground truth of
-            a batch of samples
-            @params pred_json: {'FILENAME': 'HTML CODE', ...}
-            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
-            @output: {'FILENAME': 'TEDS SCORE', ...}
-        '''
-        samples = true_json.keys()
-        if self.n_jobs == 1:
-            scores = [self.evaluate(pred_json.get(
-                filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
-        else:
-            inputs = [{'pred': pred_json.get(
-                filename, ''), 'true': true_json[filename]['html']} for filename in samples]
-            scores = parallel_process(
-                inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
-        scores = dict(zip(samples, scores))
-        return scores
-
-    def batch_evaluate_html(self, pred_htmls, true_htmls):
-        ''' Computes TEDS score between the prediction and the ground truth of
-            a batch of samples
-        '''
-        if self.n_jobs == 1:
-            scores = [self.evaluate(pred_html, true_html) for (
-                pred_html, true_html) in zip(pred_htmls, true_htmls)]
-        else:
-            inputs = [{"pred": pred_html, "true": true_html} for(
-                pred_html, true_html) in zip(pred_htmls, true_htmls)]
-
-            scores = parallel_process(
-                inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
-        return scores
-
-
-if __name__ == '__main__':
-    import json
-    import pprint
-    with open('sample_pred.json') as fp:
-        pred_json = json.load(fp)
-    with open('sample_gt.json') as fp:
-        true_json = json.load(fp)
-    teds = TEDS(n_jobs=4)
-    scores = teds.batch_evaluate(pred_json, true_json)
-    pp = pprint.PrettyPrinter()
-    pp.pprint(scores)
--- a/ppstructure/table/tablepyxl/__init__.py
+++ b/ppstructure/table/tablepyxl/__init__.py
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
--- a/ppstructure/table/tablepyxl/style.py
+++ b/ppstructure/table/tablepyxl/style.py
--- a/ppstructure/table/tablepyxl/tablepyxl.py
+++ b/ppstructure/table/tablepyxl/tablepyxl.py
-# Do imports like python3 so our package works for 2 and 3
-from __future__ import absolute_import
-
-from lxml import html
-from openpyxl import Workbook
-from openpyxl.utils import get_column_letter
-from premailer import Premailer
-from tablepyxl.style import Table
-
-
-def string_to_int(s):
-    if s.isdigit():
-        return int(s)
-    return 0
-
-
-def get_Tables(doc):
-    tree = html.fromstring(doc)
-    comments = tree.xpath('//comment()')
-    for comment in comments:
-        comment.drop_tag()
-    return [Table(table) for table in tree.xpath('//table')]
-
-
-def write_rows(worksheet, elem, row, column=1):
-    """
-    Writes every tr child element of elem to a row in the worksheet
-    returns the next row after all rows are written
-    """
-    from openpyxl.cell.cell import MergedCell
-
-    initial_column = column
-    for table_row in elem.rows:
-        for table_cell in table_row.cells:
-            cell = worksheet.cell(row=row, column=column)
-            while isinstance(cell, MergedCell):
-                column += 1
-                cell = worksheet.cell(row=row, column=column)
-
-            colspan = string_to_int(table_cell.element.get("colspan", "1"))
-            rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
-            if rowspan > 1 or colspan > 1:
-                worksheet.merge_cells(start_row=row, start_column=column,
-                                      end_row=row + rowspan - 1, end_column=column + colspan - 1)
-
-            cell.value = table_cell.value
-            table_cell.format(cell)
-            min_width = table_cell.get_dimension('min-width')
-            max_width = table_cell.get_dimension('max-width')
-
-            if colspan == 1:
-                # Initially, when iterating for the first time through the loop, the width of all the cells is None.
-                # As we start filling in contents, the initial width of the cell (which can be retrieved by:
-                # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
-                # cell in the same column (i.e. width of A2 = width of A1)
-                width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2)
-                if max_width and width > max_width:
-                    width = max_width
-                elif min_width and width < min_width:
-                    width = min_width
-                worksheet.column_dimensions[get_column_letter(column)].width = width
-            column += colspan
-        row += 1
-        column = initial_column
-    return row
-
-
-def table_to_sheet(table, wb):
-    """
-    Takes a table and workbook and writes the table to a new sheet.
-    The sheet title will be the same as the table attribute name.
-    """
-    ws = wb.create_sheet(title=table.element.get('name'))
-    insert_table(table, ws, 1, 1)
-
-
-def document_to_workbook(doc, wb=None, base_url=None):
-    """
-    Takes a string representation of an html document and writes one sheet for
-    every table in the document.
-    The workbook is returned
-    """
-    if not wb:
-        wb = Workbook()
-        wb.remove(wb.active)
-
-    inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform()
-    tables = get_Tables(inline_styles_doc)
-
-    for table in tables:
-        table_to_sheet(table, wb)
-
-    return wb
-
-
-def document_to_xl(doc, filename, base_url=None):
-    """
-    Takes a string representation of an html document and writes one sheet for
-    every table in the document. The workbook is written out to a file called filename
-    """
-    wb = document_to_workbook(doc, base_url=base_url)
-    wb.save(filename)
-
-
-def insert_table(table, worksheet, column, row):
-    if table.head:
-        row = write_rows(worksheet, table.head, row, column)
-    if table.body:
-        row = write_rows(worksheet, table.body, row, column)
-
-
-def insert_table_at_cell(table, cell):
-    """
-    Inserts a table at the location of an openpyxl Cell object.
-    """
-    ws = cell.parent
-    column, row = cell.column, cell.row
-    insert_table(table, ws, column, row)
\ No newline at end of file
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
--- a/tools/infer/predict_det.py
+++ b/tools/infer/predict_det.py
@@ -43,7 +43,7 @@ class TextDetector(object):
        pre_process_list = [{
            'DetResizeForTest': {
                'limit_side_len': args.det_limit_side_len,
-                'limit_type': args.det_limit_type,
+                'limit_type': args.det_limit_type
            }
        }, {
            'NormalizeImage': {

--- a/tools/infer/predict_system.py
+++ b/tools/infer/predict_system.py
--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py