Commit f1506916 authored by sugon_cxj

first commit

parent 55c28ed5
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import numpy as np
import datetime
__all__ = ['TrainingStats', 'Time']
class SmoothedValue(object):
"""Track a series of values and provide access to smoothed values over a
window or the global series average.
"""
def __init__(self, window_size):
self.deque = collections.deque(maxlen=window_size)
def add_value(self, value):
self.deque.append(value)
def get_median_value(self):
return np.median(self.deque)
def Time():
return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
class TrainingStats(object):
def __init__(self, window_size, stats_keys):
self.window_size = window_size
self.smoothed_losses_and_metrics = {
key: SmoothedValue(window_size)
for key in stats_keys
}
def update(self, stats):
for k, v in stats.items():
if k not in self.smoothed_losses_and_metrics:
self.smoothed_losses_and_metrics[k] = SmoothedValue(
self.window_size)
self.smoothed_losses_and_metrics[k].add_value(v)
def get(self, extras=None):
stats = collections.OrderedDict()
if extras:
for k, v in extras.items():
stats[k] = v
for k, v in self.smoothed_losses_and_metrics.items():
stats[k] = round(v.get_median_value(), 6)
return stats
def log(self, extras=None):
d = self.get(extras)
strs = []
for k, v in d.items():
strs.append('{}: {:.6f}'.format(k, v))
strs = ', '.join(strs)
return strs
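# A minimal usage sketch of TrainingStats (an editor's illustration, not part
# of the original file): each key gets a SmoothedValue and log() reports the
# window medians.
#
#   stats = TrainingStats(window_size=20, stats_keys=['loss'])
#   for step in range(5):
#       stats.update({'loss': 1.0 / (step + 1)})
#   print(Time(), stats.log())   # -> "... loss: 0.333333"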
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import imghdr
import cv2
import random
import numpy as np
import paddle
def print_dict(d, logger, delimiter=0):
"""
Recursively log a dict, indenting according to the nesting of its keys.
"""
for k, v in sorted(d.items()):
if isinstance(v, dict):
logger.info("{}{} : ".format(delimiter * " ", str(k)))
print_dict(v, logger, delimiter + 4)
elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
logger.info("{}{} : ".format(delimiter * " ", str(k)))
for value in v:
print_dict(value, logger, delimiter + 4)
else:
logger.info("{}{} : {}".format(delimiter * " ", k, v))
def get_check_global_params(mode):
check_params = ['use_gpu', 'max_text_length', 'image_shape', \
'character_type', 'loss_type']
if mode == "train_eval":
check_params = check_params + [ \
'train_batch_size_per_card', 'test_batch_size_per_card']
elif mode == "test":
check_params = check_params + ['test_batch_size_per_card']
return check_params
def _check_image_file(path):
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'}
return any([path.lower().endswith(e) for e in img_end])
def get_image_file_list(img_file):
imgs_lists = []
if img_file is None or not os.path.exists(img_file):
raise Exception("not found any img file in {}".format(img_file))
img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif'}
if os.path.isfile(img_file) and _check_image_file(img_file):
imgs_lists.append(img_file)
elif os.path.isdir(img_file):
for single_file in os.listdir(img_file):
file_path = os.path.join(img_file, single_file)
if os.path.isfile(file_path) and _check_image_file(file_path):
imgs_lists.append(file_path)
if len(imgs_lists) == 0:
raise Exception("not found any img file in {}".format(img_file))
imgs_lists = sorted(imgs_lists)
return imgs_lists
def check_and_read_gif(img_path):
if os.path.basename(img_path)[-3:] in ['gif', 'GIF']:
gif = cv2.VideoCapture(img_path)
ret, frame = gif.read()
if not ret:
logger = logging.getLogger('ppocr')
logger.info("Cannot read {}. This gif image maybe corrupted.")
return None, False
if len(frame.shape) == 2 or frame.shape[-1] == 1:
frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
imgvalue = frame[:, :, ::-1]
return imgvalue, True
return None, False
def load_vqa_bio_label_maps(label_map_path):
with open(label_map_path, "r", encoding='utf-8') as fin:
lines = fin.readlines()
lines = [line.strip() for line in lines]
if "O" not in lines:
lines.insert(0, "O")
labels = []
for line in lines:
if line == "O":
labels.append("O")
else:
labels.append("B-" + line)
labels.append("I-" + line)
label2id_map = {label: idx for idx, label in enumerate(labels)}
id2label_map = {idx: label for idx, label in enumerate(labels)}
return label2id_map, id2label_map
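# An illustrative sketch (editor's addition): for a label file containing the
# two lines "HEADER" and "QUESTION", load_vqa_bio_label_maps expands each
# class into B-/I- tags and prepends "O":
#
#   import tempfile
#   with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
#       f.write("HEADER\nQUESTION\n")
#   label2id, id2label = load_vqa_bio_label_maps(f.name)
#   # label2id == {'O': 0, 'B-HEADER': 1, 'I-HEADER': 2,
#   #              'B-QUESTION': 3, 'I-QUESTION': 4}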
def set_seed(seed=1024):
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
class AverageMeter:
def __init__(self):
self.reset()
def reset(self):
"""reset"""
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
"""update"""
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
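# A short sketch of AverageMeter (editor's addition): update() keeps a running
# average weighted by n, e.g. a per-batch loss weighted by batch size.
#
#   meter = AverageMeter()
#   meter.update(0.9, n=32)   # batch of 32 samples with mean loss 0.9
#   meter.update(0.5, n=16)   # batch of 16 samples with mean loss 0.5
#   # meter.avg == (0.9 * 32 + 0.5 * 16) / 48 ~= 0.7667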
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, __dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..')))
from ppocr.data import build_dataloader
from ppocr.modeling.architectures import build_model
from ppocr.postprocess import build_post_process
from ppocr.metrics import build_metric
from ppocr.utils.save_load import load_model
import tools.program as program
def main():
global_config = config['Global']
# build dataloader
valid_dataloader = build_dataloader(config, 'Eval', device, logger)
# build post process
post_process_class = build_post_process(config['PostProcess'],
global_config)
# build model
# for rec algorithm
if hasattr(post_process_class, 'character'):
char_num = len(getattr(post_process_class, 'character'))
if config['Architecture']["algorithm"] in ["Distillation",
]: # distillation model
for key in config['Architecture']["Models"]:
if config['Architecture']['Models'][key]['Head'][
'name'] == 'MultiHead': # for multi head
out_channels_list = {}
if config['PostProcess'][
'name'] == 'DistillationSARLabelDecode':
char_num = char_num - 2
out_channels_list['CTCLabelDecode'] = char_num
out_channels_list['SARLabelDecode'] = char_num + 2
config['Architecture']['Models'][key]['Head'][
'out_channels_list'] = out_channels_list
else:
config['Architecture']["Models"][key]["Head"][
'out_channels'] = char_num
elif config['Architecture']['Head'][
'name'] == 'MultiHead': # for multi head
out_channels_list = {}
if config['PostProcess']['name'] == 'SARLabelDecode':
char_num = char_num - 2
out_channels_list['CTCLabelDecode'] = char_num
out_channels_list['SARLabelDecode'] = char_num + 2
config['Architecture']['Head'][
'out_channels_list'] = out_channels_list
else: # base rec model
config['Architecture']["Head"]['out_channels'] = char_num
model = build_model(config['Architecture'])
extra_input_models = ["SRN", "NRTR", "SAR", "SEED", "SVTR"]
extra_input = False
if config['Architecture']['algorithm'] == 'Distillation':
for key in config['Architecture']["Models"]:
extra_input = extra_input or config['Architecture']['Models'][key][
'algorithm'] in extra_input_models
else:
extra_input = config['Architecture']['algorithm'] in extra_input_models
if "model_type" in config['Architecture'].keys():
model_type = config['Architecture']['model_type']
else:
model_type = None
best_model_dict = load_model(
config, model, model_type=model_type)
if len(best_model_dict):
logger.info('metric in ckpt ***************')
for k, v in best_model_dict.items():
logger.info('{}:{}'.format(k, v))
# build metric
eval_class = build_metric(config['Metric'])
# start eval
metric = program.eval(model, valid_dataloader, post_process_class,
eval_class, model_type, extra_input)
logger.info('metric eval ***************')
for k, v in metric.items():
logger.info('{}:{}'.format(k, v))
if __name__ == '__main__':
config, device, logger, vdl_writer = program.preprocess()
main()
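# Example invocation (a sketch; the config path and checkpoint are
# placeholders, the -c/-o flags come from ArgsParser in tools/program.py):
#   python tools/eval.py -c configs/rec/your_config.yml \
#       -o Global.checkpoints=./output/rec/best_accuracy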
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..')))
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import copy
import numpy as np
import math
import time
import traceback
import tools.infer.utility as utility
from ppocr.postprocess import build_post_process
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, check_and_read_gif
logger = get_logger()
class TextClassifier(object):
def __init__(self, args):
self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
self.cls_batch_num = args.cls_batch_num
self.cls_thresh = args.cls_thresh
postprocess_params = {
'name': 'ClsPostProcess',
"label_list": args.label_list,
}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, _ = \
utility.create_predictor(args, 'cls', logger)
self.use_onnx = args.use_onnx
def resize_norm_img(self, img):
imgC, imgH, imgW = self.cls_image_shape
h = img.shape[0]
w = img.shape[1]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
if self.cls_image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def __call__(self, img_list):
img_list = copy.deepcopy(img_list)
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the cls process
indices = np.argsort(np.array(width_list))
cls_res = [['', 0.0]] * img_num
batch_num = self.cls_batch_num
elapse = 0
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
max_wh_ratio = 0
starttime = time.time()
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img(img_list[indices[ino]])
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
if self.use_onnx:
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(self.output_tensors, input_dict)
prob_out = outputs[0]
else:
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
prob_out = self.output_tensors[0].copy_to_cpu()
self.predictor.try_shrink_memory()
cls_result = self.postprocess_op(prob_out)
elapse += time.time() - starttime
for rno in range(len(cls_result)):
label, score = cls_result[rno]
cls_res[indices[beg_img_no + rno]] = [label, score]
if '180' in label and score > self.cls_thresh:
img_list[indices[beg_img_no + rno]] = cv2.rotate(
img_list[indices[beg_img_no + rno]], cv2.ROTATE_180)
return img_list, cls_res, elapse
def main(args):
image_file_list = get_image_file_list(args.image_dir)
text_classifier = TextClassifier(args)
valid_image_file_list = []
img_list = []
for image_file in image_file_list:
img, flag = check_and_read_gif(image_file)
if not flag:
img = cv2.imread(image_file)
if img is None:
logger.info("error in loading image:{}".format(image_file))
continue
valid_image_file_list.append(image_file)
img_list.append(img)
try:
img_list, cls_res, predict_time = text_classifier(img_list)
except Exception as E:
logger.info(traceback.format_exc())
logger.info(E)
exit()
for ino in range(len(img_list)):
logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
cls_res[ino]))
if __name__ == "__main__":
main(utility.parse_args())
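# Example invocation (a sketch; the paths are placeholders, the flags come
# from tools/infer/utility.py):
#   python tools/infer/predict_cls.py --image_dir=./doc/imgs \
#       --cls_model_dir=./inference/cls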
import os
import sys
import paddle
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..')))
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import numpy as np
import time
import sys
from scipy.spatial import distance as dist
import tools.infer.utility as utility
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, check_and_read_gif
from ppocr.data import create_operators, transform
from ppocr.postprocess import build_post_process
import json
logger = get_logger()
class TextDetector(object):
def __init__(self, args):
self.args = args
self.det_algorithm = args.det_algorithm
self.use_onnx = args.use_onnx
pre_process_list = [{
'DetResizeForSingle': None
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
'mean': [0.485, 0.456, 0.406],
'scale': '1./255.',
'order': 'hwc'
}
}, {
'ToCHWImage': None
}, {
'KeepKeys': {
'keep_keys': ['image', 'shape']
}
}]
postprocess_params = {}
if self.det_algorithm == "DB":
postprocess_params['name'] = 'DBPostProcess'
postprocess_params["thresh"] = args.det_db_thresh
postprocess_params["box_thresh"] = args.det_db_box_thresh
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
else:
logger.info("not support det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor(
args, 'det', logger)
def order_points_clockwise(self, pts):
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
diff = np.diff(np.array(tmp), axis=1)
rect[1] = tmp[np.argmin(diff)]
rect[3] = tmp[np.argmax(diff)]
return rect
def clip_det_res(self, points, img_height, img_width):
for pno in range(points.shape[0]):
points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
return points
def filter_tag_det_res(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.order_points_clockwise(box)
box = self.clip_det_res(box, img_height, img_width)
rect_width = int(np.linalg.norm(box[0] - box[1]))
rect_height = int(np.linalg.norm(box[0] - box[3]))
if rect_width <= 3 or rect_height <= 3:
continue
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.clip_det_res(box, img_height, img_width)
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
ori_im = img.copy()
data = {'image': img}
st = time.time()
if self.args.benchmark:
self.autolog.times.start()
data = transform(data, self.preprocess_op)
img, shape_list = data
if img is None:
return None, 0
img = np.expand_dims(img, axis=0)
shape_list = np.expand_dims(shape_list, axis=0)
# print(img.shape)
img = img.copy()
self.input_tensor.copy_from_cpu(img)
self.predictor.run()
paddle.device.cuda.synchronize()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if self.args.benchmark:
self.autolog.times.stamp()
preds = {}
if self.det_algorithm in ['DB', 'PSE']:
preds['maps'] = outputs[0]
else:
raise NotImplementedError
post_result = self.postprocess_op(preds, shape_list)
dt_boxes = post_result[0]['points']
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
if self.args.benchmark:
self.autolog.times.end(stamp=True)
et = time.time()
return dt_boxes, et - st
if __name__ == "__main__":
args = utility.parse_args()
image_file_list = get_image_file_list(args.image_dir)
text_detector = TextDetector(args)
count = 0
total_time = 0
draw_img_save = "./inference_results"
if args.warmup:
img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
for i in range(2):
res = text_detector(img)
if not os.path.exists(draw_img_save):
os.makedirs(draw_img_save)
save_results = []
for image_file in image_file_list:
img, flag = check_and_read_gif(image_file)
if not flag:
img = cv2.imread(image_file)
if img is None:
logger.info("error in loading image:{}".format(image_file))
continue
st = time.time()
dt_boxes, _ = text_detector(img)
elapse = time.time() - st
if count > 0:
total_time += elapse
count += 1
save_pred = os.path.basename(image_file) + "\t" + str(
json.dumps([x.tolist() for x in dt_boxes])) + "\n"
save_results.append(save_pred)
logger.info(save_pred)
logger.info("The predict time of {}: {}".format(image_file, elapse))
src_im = utility.draw_text_det_res(dt_boxes, image_file)
img_name_pure = os.path.split(image_file)[-1]
img_path = os.path.join(draw_img_save,
"det_res_{}".format(img_name_pure))
cv2.imwrite(img_path, src_im)
logger.info("The visualized image saved in {}".format(img_path))
with open(os.path.join(draw_img_save, "det_results.txt"), 'w') as f:
f.writelines(save_results)
f.close()
if args.benchmark:
text_detector.autolog.report()
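# Example invocation (a sketch; the paths are placeholders). Only the DB
# algorithm is supported by this TextDetector; predictions are written to
# ./inference_results/det_results.txt and visualizations to det_res_<name>:
#   python tools/infer/predict_det.py --image_dir=./doc/imgs \
#       --det_model_dir=./inference/det_db --det_algorithm=DB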
import os
import sys
from PIL import Image
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..')))
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import numpy as np
import math
import time
import traceback
import paddle
import tools.infer.utility as utility
from ppocr.postprocess import build_post_process
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, check_and_read_gif
logger = get_logger()
class TextRecognizer(object):
def __init__(self, args):
self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
self.rec_batch_num = args.rec_batch_num
postprocess_params = {
'name': 'CTCLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
self.postprocess_op = build_post_process(postprocess_params)
self.predictor, self.input_tensor, self.output_tensors, self.config = \
utility.create_predictor(args, 'rec', logger)
def resize_norm_img_section(self, img, max_wh_ratio):
# print("rec resize for section")
imgC, imgH, imgW = self.rec_image_shape
assert imgC == img.shape[2]
rec_precision_level = os.environ.get("OCR_REC_PRECISION")
max_w = imgH * 48
# max_w = 2304
if rec_precision_level == '0':
imgW = max_w
elif rec_precision_level == '1':
imgW = int((imgH * max_wh_ratio))
if imgW <= max_w / 2:
imgW = max_w / 2
else:
imgW = max_w
elif rec_precision_level == '2':
imgW = int((imgH * max_wh_ratio))
if imgW <= max_w / 4:
imgW = max_w / 4
elif imgW > max_w / 4 and imgW <= max_w / 2:
imgW = max_w / 2
elif imgW > max_w / 2 and imgW <= 3 * max_w / 4:
imgW = 3 * max_w / 4
else:
imgW = max_w
else:
imgW = int((imgH * max_wh_ratio))
if imgW <= max_w / 6:
imgW = max_w / 6
elif imgW > max_w / 6 and imgW <= max_w / 3:
imgW = max_w / 3
elif imgW > max_w / 3 and imgW <= max_w / 2:
imgW = max_w / 2
elif imgW > max_w / 2 and imgW <= 2 * max_w / 3:
imgW = 2 * max_w / 3
elif imgW > 2 * max_w / 3 and imgW <= 5 * max_w / 6:
imgW = 5 * max_w / 6
else:
imgW = max_w
imgW = int(imgW)
h, w = img.shape[:2]
ratio = w / float(h)
if math.ceil(imgH * ratio) > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the recognition process
indices = np.argsort(np.array(width_list))
rec_res = [['', 0.0]] * img_num
max_batnum = 24
min_batnum = 8
if os.environ.get("OCR_REC_MAX_BATNUM") is not None:
max_batnum = int(os.environ.get("OCR_REC_MAX_BATNUM"))
if os.environ.get("OCR_REC_MIN_BATNUM") is not None:
min_batnum = int(os.environ.get("OCR_REC_MIN_BATNUM"))
assert max_batnum / min_batnum == int(max_batnum / min_batnum), "max_batnum must be a multiple of min_batnum."
img_num_left = img_num
img_no_count = 0
st = time.time()
if img_num_left > max_batnum:
batch_num = max_batnum
batch_num = int(batch_num)
for beg_img_no in range(img_no_count, int(img_num_left / batch_num) * batch_num, batch_num):
end_img_no = beg_img_no + batch_num
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img_section(img_list[indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch, axis=0)
norm_img_batch = norm_img_batch.copy()
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if len(outputs) != 1:
preds = outputs
else:
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
img_no_count = int(img_num_left / batch_num) * batch_num
img_num_left = img_num_left - int(img_num_left / batch_num) * batch_num
batch_num = math.ceil(img_num_left / min_batnum) * min_batnum
batch_num = int(batch_num)
Dnum = batch_num - img_num_left
for dno in range(Dnum):
indices = np.append(indices,img_num + dno)
rec_res.append(['', 0.0])
beg_img_no = img_no_count
end_img_no = img_num
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img_section(img_list[indices[ino]], max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
if norm_img_batch.shape[0] != batch_num:
img_tmp = np.zeros((batch_num - norm_img_batch.shape[0], norm_img_batch.shape[1], norm_img_batch.shape[2], norm_img_batch.shape[3]), dtype=np.float32)
norm_img_batch = np.concatenate([norm_img_batch, img_tmp])
norm_img_batch = norm_img_batch.copy()
self.input_tensor.copy_from_cpu(norm_img_batch)
self.predictor.run()
outputs = []
for output_tensor in self.output_tensors:
output = output_tensor.copy_to_cpu()
outputs.append(output)
if len(outputs) != 1:
preds = outputs
else:
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
return rec_res, time.time() - st
def main(args):
image_file_list = get_image_file_list(args.image_dir)
text_recognizer = TextRecognizer(args)
valid_image_file_list = []
img_list = []
logger.info(
"In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
"if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320"
)
# warmup 2 times
if args.warmup:
img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8)
for i in range(2):
res = text_recognizer([img] * int(args.rec_batch_num))
for image_file in image_file_list:
img, flag = check_and_read_gif(image_file)
if not flag:
img = cv2.imread(image_file)
if img is None:
logger.info("error in loading image:{}".format(image_file))
continue
valid_image_file_list.append(image_file)
img_list.append(img)
try:
rec_res, _ = text_recognizer(img_list)
except Exception as E:
logger.info(traceback.format_exc())
logger.info(E)
exit()
for ino in range(len(img_list)):
logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
rec_res[ino]))
if args.benchmark:
text_recognizer.autolog.report()
if __name__ == "__main__":
main(utility.parse_args())
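# Example invocation (a sketch; the paths are placeholders). The recognizer
# honors three optional environment variables used above: OCR_REC_PRECISION
# ('0'/'1'/'2', the width-quantization level in resize_norm_img_section) and
# OCR_REC_MAX_BATNUM / OCR_REC_MIN_BATNUM (batch-size bounds; max must be a
# multiple of min):
#   OCR_REC_PRECISION=1 OCR_REC_MAX_BATNUM=24 OCR_REC_MIN_BATNUM=8 \
#       python tools/infer/predict_rec.py --image_dir=./doc/imgs_words \
#       --rec_model_dir=./inference/rec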
import os
import sys
import subprocess
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..')))
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
import cv2
import copy
import numpy as np
import json
import time
import logging
from PIL import Image
import tools.infer.utility as utility
import tools.infer.predict_rec as predict_rec
import tools.infer.predict_det as predict_det
import tools.infer.predict_cls as predict_cls
from ppocr.utils.utility import get_image_file_list, check_and_read_gif
from ppocr.utils.logging import get_logger
from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image
logger = get_logger()
class TextSystem(object):
def __init__(self, args):
if not args.show_log:
logger.setLevel(logging.INFO)
self.text_detector = predict_det.TextDetector(args)
self.text_recognizer = predict_rec.TextRecognizer(args)
self.use_angle_cls = args.use_angle_cls
self.drop_score = args.drop_score
if self.use_angle_cls:
self.text_classifier = predict_cls.TextClassifier(args)
self.args = args
def __call__(self, img, cls=True):
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
# logger.debug("dt_boxes num : {}, elapse : {}".format(
# len(dt_boxes), elapse))
if dt_boxes is None:
return None, None
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
img_crop = get_rotate_crop_image(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls and cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
logger.debug("cls num : {}, elapse : {}".format(
len(img_crop_list), elapse))
rec_res, elapse = self.text_recognizer(img_crop_list)
# logger.debug("rec_res num : {}, elapse : {}".format(
# len(rec_res), elapse))
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
return filter_boxes, filter_rec_res
def sorted_boxes(dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
for j in range(i, -1, -1):
if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
(_boxes[j + 1][0][0] < _boxes[j][0][0]):
tmp = _boxes[j]
_boxes[j] = _boxes[j + 1]
_boxes[j + 1] = tmp
else:
break
return _boxes
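# A small worked example (editor's addition): boxes whose top-left
# y-coordinates differ by less than 10 px are treated as one text line and
# re-ordered by x.
#
#   boxes = np.array([[[100, 11], [160, 11], [160, 30], [100, 30]],
#                     [[  5, 14], [ 65, 14], [ 65, 33], [  5, 33]]], dtype=float)
#   [b[0].tolist() for b in sorted_boxes(boxes)]
#   # -> [[5.0, 14.0], [100.0, 11.0]]  (same line, sorted left to right)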
def main(args):
image_file_list = get_image_file_list(args.image_dir)
image_file_list = image_file_list[args.process_id::args.total_process_num]
text_sys = TextSystem(args)
is_visualize = False
font_path = args.vis_font_path
drop_score = args.drop_score
draw_img_save_dir = args.draw_img_save_dir
os.makedirs(draw_img_save_dir, exist_ok=True)
save_results = []
# warm up
if args.warmup:
warmup_file_list = get_image_file_list("./warmup_images_5/")
warmup_file_rec_list = get_image_file_list("./warmup_images_rec/")
startwarm = time.time()
for warmup_file in warmup_file_list:
print(warmup_file)
img_warm = cv2.imread(warmup_file)
res = text_sys.text_detector(img_warm)
for warmup_file_rec in warmup_file_rec_list:
print(warmup_file_rec)
img_warm_rec = cv2.imread(warmup_file_rec)
max_batnum = 24
min_batnum = 8
if os.environ.get("OCR_REC_MAX_BATNUM") is not None:
max_batnum = int(os.environ.get("OCR_REC_MAX_BATNUM"))
if os.environ.get("OCR_REC_MIN_BATNUM") is not None:
min_batnum = int(os.environ.get("OCR_REC_MIN_BATNUM"))
assert max_batnum / min_batnum == int(max_batnum / min_batnum), "max_batnum must be a multiple of min_batnum."
for bn in range(int(max_batnum / min_batnum)):
img_rec_list = []
for i in range(min_batnum * (bn + 1)):
img_rec_list.append(img_warm_rec)
rec_results = text_sys.text_recognizer(img_rec_list)
elapsewarm = time.time() - startwarm
logger.debug("warmup time:{}".format(elapsewarm))
total_time = 0
_st = time.time()
for idx, image_file in enumerate(image_file_list):
img, flag = check_and_read_gif(image_file)
if not flag:
img = cv2.imread(image_file)
if img is None:
logger.debug("error in loading image:{}".format(image_file))
continue
starttime = time.time()
dt_boxes, rec_res = text_sys(img)
elapse = time.time() - starttime
total_time += elapse
logger.debug(
str(idx) + " Predict time of %s: %.3fs" % (image_file, elapse))
for text, score in rec_res:
logger.debug("{}, {:.3f}".format(text, score))
res = [{
"transcription": rec_res[idx][0],
"points": np.array(dt_boxes[idx]).astype(np.int32).tolist(),
} for idx in range(len(dt_boxes))]
save_pred = os.path.basename(image_file) + "\t" + json.dumps(
res, ensure_ascii=False) + "\n"
save_results.append(save_pred)
if is_visualize:
image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
boxes = dt_boxes
txts = [rec_res[i][0] for i in range(len(rec_res))]
scores = [rec_res[i][1] for i in range(len(rec_res))]
draw_img = draw_ocr_box_txt(
image,
boxes,
txts,
scores,
drop_score=drop_score,
font_path=font_path)
if flag:
image_file = image_file[:-3] + "png"
cv2.imwrite(
os.path.join(draw_img_save_dir, os.path.basename(image_file)),
draw_img[:, :, ::-1])
logger.debug("The visualized image saved in {}".format(
os.path.join(draw_img_save_dir, os.path.basename(image_file))))
logger.info("The predict total time is {}".format(time.time() - _st))
if args.total_process_num > 1:
save_results_path = os.path.join(draw_img_save_dir, f"system_results_{args.process_id}.txt")
else:
save_results_path = os.path.join(draw_img_save_dir, "system_results.txt")
with open(save_results_path, 'w', encoding='utf-8') as f:
f.writelines(save_results)
if __name__ == "__main__":
args = utility.parse_args()
if args.use_mp:
p_list = []
total_process_num = args.total_process_num
for process_id in range(total_process_num):
cmd = [sys.executable, "-u"] + sys.argv + [
"--process_id={}".format(process_id),
"--use_mp={}".format(False)
]
p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout)
p_list.append(p)
for p in p_list:
p.wait()
else:
main(args)
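# Example invocation (a sketch; model paths are placeholders):
#   python tools/infer/predict_system.py --image_dir=./doc/imgs \
#       --det_model_dir=./inference/det --rec_model_dir=./inference/rec \
#       --use_angle_cls=true --cls_model_dir=./inference/cls
# With --use_mp=true and --total_process_num=N the script re-launches itself N
# times via subprocess and shards image_file_list by process_id (see __main__).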
import argparse
import os
import sys
import platform
import cv2
import numpy as np
import paddle
from PIL import Image, ImageDraw, ImageFont
import math
from paddle import inference
import time
from ppocr.utils.logging import get_logger
def str2bool(v):
return v.lower() in ("true", "t", "1")
def init_args():
parser = argparse.ArgumentParser()
# params for prediction engine
parser.add_argument("--use_gpu", type=str2bool, default=True)
parser.add_argument("--use_xpu", type=str2bool, default=False)
parser.add_argument("--ir_optim", type=str2bool, default=True)
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
parser.add_argument("--min_subgraph_size", type=int, default=15)
parser.add_argument("--precision", type=str, default="fp32")
parser.add_argument("--gpu_mem", type=int, default=500)
# params for text detector
parser.add_argument("--image_dir", type=str)
parser.add_argument("--det_algorithm", type=str, default='DB')
parser.add_argument("--det_model_dir", type=str)
parser.add_argument("--det_limit_side_len", type=float, default=960)
parser.add_argument("--det_limit_type", type=str, default='max')
# DB parmas
parser.add_argument("--det_db_thresh", type=float, default=0.3)
parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
parser.add_argument("--max_batch_size", type=int, default=10)
parser.add_argument("--use_dilation", type=str2bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast")
parser.add_argument("--vis_seg_map", type=str2bool, default=False)
# params for text recognizer
parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet')
parser.add_argument("--rec_model_dir", type=str)
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
parser.add_argument("--rec_batch_num", type=int, default=6)
parser.add_argument("--max_text_length", type=int, default=25)
parser.add_argument(
"--rec_char_dict_path",
type=str,
default="./ppocr/utils/ppocr_keys_v1.txt")
parser.add_argument("--use_space_char", type=str2bool, default=True)
parser.add_argument(
"--vis_font_path", type=str, default="./doc/fonts/simfang.ttf")
parser.add_argument("--drop_score", type=float, default=0.5)
# params for text classifier
parser.add_argument("--use_angle_cls", type=str2bool, default=False)
parser.add_argument("--cls_model_dir", type=str)
parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
parser.add_argument("--label_list", type=list, default=['0', '180'])
parser.add_argument("--cls_batch_num", type=int, default=6)
parser.add_argument("--cls_thresh", type=float, default=0.9)
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
parser.add_argument("--cpu_threads", type=int, default=10)
parser.add_argument("--use_pdserving", type=str2bool, default=False)
parser.add_argument("--warmup", type=str2bool, default=False)
#
parser.add_argument(
"--draw_img_save_dir", type=str, default="./inference_results")
parser.add_argument("--save_crop_res", type=str2bool, default=False)
parser.add_argument("--crop_res_save_dir", type=str, default="./output")
# multi-process
parser.add_argument("--use_mp", type=str2bool, default=False)
parser.add_argument("--total_process_num", type=int, default=1)
parser.add_argument("--process_id", type=int, default=0)
parser.add_argument("--benchmark", type=str2bool, default=False)
parser.add_argument("--save_log_path", type=str, default="./log_output/")
parser.add_argument("--show_log", type=str2bool, default=True)
parser.add_argument("--use_onnx", type=str2bool, default=False)
return parser
def parse_args():
parser = init_args()
return parser.parse_args()
def create_predictor(args, mode, logger):
if mode == "det":
model_dir = args.det_model_dir
elif mode == 'cls':
model_dir = args.cls_model_dir
else:
model_dir = args.rec_model_dir
if model_dir is None:
logger.info("not find {} model file path {}".format(mode, model_dir))
sys.exit(0)
model_file_path = model_dir + "/inference.pdmodel"
params_file_path = model_dir + "/inference.pdiparams"
if not os.path.exists(model_file_path):
raise ValueError("not find model file path {}".format(
model_file_path))
if not os.path.exists(params_file_path):
raise ValueError("not find params file path {}".format(
params_file_path))
config = inference.Config(model_file_path, params_file_path)
if hasattr(args, 'precision'):
if args.precision == "fp16" and args.use_tensorrt:
precision = inference.PrecisionType.Half
print("fp16 set success!")
elif args.precision == "int8":
precision = inference.PrecisionType.Int8
else:
precision = inference.PrecisionType.Float32
else:
precision = inference.PrecisionType.Float32
if args.use_gpu:
gpu_id = get_infer_gpuid()
if gpu_id is None:
logger.warning(
"No GPU found on the current device via nvidia-smi. Please check your device, or ignore this warning if running on Jetson."
)
config.enable_use_gpu(args.gpu_mem, 0)
use_dynamic_shape = True
if mode == "det":
min_input_shape = {
"x": [1, 3, 50, 50],
"conv2d_92.tmp_0": [1, 120, 20, 20],
"conv2d_91.tmp_0": [1, 24, 10, 10],
"conv2d_59.tmp_0": [1, 96, 20, 20],
"nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
"nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
"conv2d_124.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
"elementwise_add_7": [1, 56, 2, 2],
"nearest_interp_v2_0.tmp_0": [1, 256, 2, 2]
}
max_input_shape = {
"x": [1, 3, 1536, 1536],
"conv2d_92.tmp_0": [1, 120, 400, 400],
"conv2d_91.tmp_0": [1, 24, 200, 200],
"conv2d_59.tmp_0": [1, 96, 400, 400],
"nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
"conv2d_124.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
"elementwise_add_7": [1, 56, 400, 400],
"nearest_interp_v2_0.tmp_0": [1, 256, 400, 400]
}
opt_input_shape = {
"x": [1, 3, 640, 640],
"conv2d_92.tmp_0": [1, 120, 160, 160],
"conv2d_91.tmp_0": [1, 24, 80, 80],
"conv2d_59.tmp_0": [1, 96, 160, 160],
"nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
"nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
"conv2d_124.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
"elementwise_add_7": [1, 56, 40, 40],
"nearest_interp_v2_0.tmp_0": [1, 256, 40, 40]
}
min_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
"nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
"nearest_interp_v2_29.tmp_0": [1, 64, 20, 20]
}
max_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
"nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
"nearest_interp_v2_29.tmp_0": [1, 64, 400, 400]
}
opt_pact_shape = {
"nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
"nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
"nearest_interp_v2_29.tmp_0": [1, 64, 160, 160]
}
min_input_shape.update(min_pact_shape)
max_input_shape.update(max_pact_shape)
opt_input_shape.update(opt_pact_shape)
elif mode == "rec":
if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]:
use_dynamic_shape = False
imgH = int(args.rec_image_shape.split(',')[-2])
min_input_shape = {"x": [1, 3, imgH, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
config.exp_disable_tensorrt_ops(["transpose2"])
elif mode == "cls":
min_input_shape = {"x": [1, 3, 48, 10]}
max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}
opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
else:
use_dynamic_shape = False
if use_dynamic_shape:
config.set_trt_dynamic_shape_info(
min_input_shape, max_input_shape, opt_input_shape)
elif args.use_xpu:
config.enable_xpu(10 * 1024 * 1024)
else:
config.disable_gpu()
if hasattr(args, "cpu_threads"):
config.set_cpu_math_library_num_threads(args.cpu_threads)
else:
# default cpu threads as 10
config.set_cpu_math_library_num_threads(10)
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
if args.precision == "fp16":
config.enable_mkldnn_bfloat16()
# enable memory optim
config.enable_memory_optim()
config.disable_glog_info()
config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
config.delete_pass("matmul_transpose_reshape_fuse_pass")
if mode == 'table':
config.delete_pass("fc_fuse_pass") # not supported for table
config.switch_use_feed_fetch_ops(False)
config.switch_ir_optim(True)
# create predictor
predictor = inference.create_predictor(config)
input_names = predictor.get_input_names()
for name in input_names:
input_tensor = predictor.get_input_handle(name)
output_tensors = get_output_tensors(args, mode, predictor)
return predictor, input_tensor, output_tensors, config
def get_output_tensors(args, mode, predictor):
output_names = predictor.get_output_names()
output_tensors = []
if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]:
output_name = 'softmax_0.tmp_0'
if output_name in output_names:
return [predictor.get_output_handle(output_name)]
else:
for output_name in output_names:
output_tensor = predictor.get_output_handle(output_name)
output_tensors.append(output_tensor)
else:
for output_name in output_names:
output_tensor = predictor.get_output_handle(output_name)
output_tensors.append(output_tensor)
return output_tensors
def get_infer_gpuid():
sysstr = platform.system()
if sysstr == "Windows":
return 0
if not paddle.fluid.core.is_compiled_with_rocm():
cmd = "env | grep CUDA_VISIBLE_DEVICES"
else:
cmd = "env | grep HIP_VISIBLE_DEVICES"
env_cuda = os.popen(cmd).readlines()
if len(env_cuda) == 0:
return 0
else:
gpu_id = env_cuda[0].strip().split("=")[1]
return int(gpu_id[0])
def draw_text_det_res(dt_boxes, img_path):
src_im = cv2.imread(img_path)
for box in dt_boxes:
box = np.array(box).astype(np.int32).reshape(-1, 2)
cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
return src_im
def draw_ocr_box_txt(image,
boxes,
txts,
scores=None,
drop_score=0.5,
font_path="./doc/simfang.ttf"):
h, w = image.height, image.width
img_left = image.copy()
img_right = Image.new('RGB', (w, h), (255, 255, 255))
import random
random.seed(0)
draw_left = ImageDraw.Draw(img_left)
draw_right = ImageDraw.Draw(img_right)
for idx, (box, txt) in enumerate(zip(boxes, txts)):
if scores is not None and scores[idx] < drop_score:
continue
color = (random.randint(0, 255), random.randint(0, 255),
random.randint(0, 255))
draw_left.polygon(box, fill=color)
draw_right.polygon(
[
box[0][0], box[0][1], box[1][0], box[1][1], box[2][0],
box[2][1], box[3][0], box[3][1]
],
outline=color)
box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)
box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)
if box_height > 2 * box_width:
font_size = max(int(box_width * 0.9), 10)
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
cur_y = box[0][1]
for c in txt:
char_size = font.getsize(c)
draw_right.text(
(box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
cur_y += char_size[1]
else:
font_size = max(int(box_height * 0.8), 10)
font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
draw_right.text(
[box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)
img_left = Image.blend(image, img_left, 0.5)
img_show = Image.new('RGB', (w * 2, h), (255, 255, 255))
img_show.paste(img_left, (0, 0, w, h))
img_show.paste(img_right, (w, 0, w * 2, h))
return np.array(img_show)
def get_rotate_crop_image(img, points):
'''
img_height, img_width = img.shape[0:2]
left = int(np.min(points[:, 0]))
right = int(np.max(points[:, 0]))
top = int(np.min(points[:, 1]))
bottom = int(np.max(points[:, 1]))
img_crop = img[top:bottom, left:right, :].copy()
points[:, 0] = points[:, 0] - left
points[:, 1] = points[:, 1] - top
'''
assert len(points) == 4, "shape of points must be 4*2"
img_crop_width = int(
max(
np.linalg.norm(points[0] - points[1]),
np.linalg.norm(points[2] - points[3])))
img_crop_height = int(
max(
np.linalg.norm(points[0] - points[3]),
np.linalg.norm(points[1] - points[2])))
pts_std = np.float32([[0, 0], [img_crop_width, 0],
[img_crop_width, img_crop_height],
[0, img_crop_height]])
M = cv2.getPerspectiveTransform(points, pts_std)
dst_img = cv2.warpPerspective(
img,
M, (img_crop_width, img_crop_height),
borderMode=cv2.BORDER_REPLICATE,
flags=cv2.INTER_CUBIC)
dst_img_height, dst_img_width = dst_img.shape[0:2]
if dst_img_height * 1.0 / dst_img_width >= 1.5:
dst_img = np.rot90(dst_img)
return dst_img
if __name__ == '__main__':
pass
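# A minimal sketch of get_rotate_crop_image (editor's addition): points must
# be float32 with shape (4, 2), ordered clockwise from the top-left corner.
#
#   img = np.zeros((100, 200, 3), dtype=np.uint8)
#   points = np.float32([[10, 10], [110, 10], [110, 40], [10, 40]])
#   crop = get_rotate_crop_image(img, points)
#   # crop.shape == (30, 100, 3); crops taller than 1.5x their width are
#   # additionally rotated 90 degrees by np.rot90.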
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import platform
import yaml
import time
import datetime
import paddle
import paddle.distributed as dist
from tqdm import tqdm
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from ppocr.utils.stats import TrainingStats
from ppocr.utils.save_load import save_model
from ppocr.utils.utility import print_dict, AverageMeter
from ppocr.utils.logging import get_logger
from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers
from ppocr.utils import profiler
from ppocr.data import build_dataloader
class ArgsParser(ArgumentParser):
def __init__(self):
super(ArgsParser, self).__init__(
formatter_class=RawDescriptionHelpFormatter)
self.add_argument("-c", "--config", help="configuration file to use")
self.add_argument(
"-o", "--opt", nargs='+', help="set configuration options")
self.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format ' \
'\"key1=value1;key2=value2;key3=value3\".'
)
def parse_args(self, argv=None):
args = super(ArgsParser, self).parse_args(argv)
assert args.config is not None, \
"Please specify --config=configure_file_path."
args.opt = self._parse_opt(args.opt)
return args
def _parse_opt(self, opts):
config = {}
if not opts:
return config
for s in opts:
s = s.strip()
k, v = s.split('=')
config[k] = yaml.load(v, Loader=yaml.Loader)
return config
def load_config(file_path):
"""
Load config from yml/yaml file.
Args:
file_path (str): Path of the config file to be loaded.
Returns: global config
"""
_, ext = os.path.splitext(file_path)
assert ext in ['.yml', '.yaml'], "only support yaml files for now"
config = yaml.load(open(file_path, 'rb'), Loader=yaml.Loader)
return config
def merge_config(config, opts):
"""
Merge config into global config.
Args:
config (dict): Config to be merged.
Returns: global config
"""
for key, value in opts.items():
if "." not in key:
if isinstance(value, dict) and key in config:
config[key].update(value)
else:
config[key] = value
else:
sub_keys = key.split('.')
assert (
sub_keys[0] in config
), "the sub_keys can only be one of global_config: {}, but get: " \
"{}, please check your running command".format(
config.keys(), sub_keys[0])
cur = config[sub_keys[0]]
for idx, sub_key in enumerate(sub_keys[1:]):
if idx == len(sub_keys) - 2:
cur[sub_key] = value
else:
cur = cur[sub_key]
return config
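# A short sketch of the dotted-key override handled by merge_config (editor's
# addition), matching the "-o key=value" options parsed by ArgsParser above:
#
#   cfg = {'Global': {'use_gpu': True, 'epoch_num': 10}}
#   merge_config(cfg, {'Global.use_gpu': False})
#   # cfg == {'Global': {'use_gpu': False, 'epoch_num': 10}}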
def check_device(use_gpu, use_xpu=False):
"""
Log an error and exit when use_gpu is set to true with a
CPU-only build of PaddlePaddle.
"""
err = "Config {} cannot be set as true while your paddle " \
"is not compiled with {} ! \nPlease try: \n" \
"\t1. Install paddlepaddle to run model on {} \n" \
"\t2. Set {} as false in config file to run " \
"model on CPU"
try:
if use_gpu and use_xpu:
print("use_xpu and use_gpu can not both be ture.")
if use_gpu and not paddle.is_compiled_with_cuda():
print(err.format("use_gpu", "cuda", "gpu", "use_gpu"))
sys.exit(1)
if use_xpu and not paddle.device.is_compiled_with_xpu():
print(err.format("use_xpu", "xpu", "xpu", "use_xpu"))
sys.exit(1)
except Exception as e:
pass
def check_xpu(use_xpu):
"""
Log an error and exit when use_xpu is set to true with a
CPU/GPU build of PaddlePaddle.
"""
err = "Config use_xpu cannot be set as true while you are " \
"using paddlepaddle cpu/gpu version ! \nPlease try: \n" \
"\t1. Install paddlepaddle-xpu to run model on XPU \n" \
"\t2. Set use_xpu as false in config file to run " \
"model on CPU/GPU"
try:
if use_xpu and not paddle.is_compiled_with_xpu():
print(err)
sys.exit(1)
except Exception as e:
pass
def train(config,
train_dataloader,
valid_dataloader,
device,
model,
loss_class,
optimizer,
lr_scheduler,
post_process_class,
eval_class,
pre_best_model_dict,
logger,
log_writer=None,
scaler=None):
cal_metric_during_train = config['Global'].get('cal_metric_during_train',
False)
calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1)
log_smooth_window = config['Global']['log_smooth_window']
epoch_num = config['Global']['epoch_num']
print_batch_step = config['Global']['print_batch_step']
eval_batch_step = config['Global']['eval_batch_step']
profiler_options = config['profiler_options']
global_step = 0
if 'global_step' in pre_best_model_dict:
global_step = pre_best_model_dict['global_step']
start_eval_step = 0
if type(eval_batch_step) == list and len(eval_batch_step) >= 2:
start_eval_step = eval_batch_step[0]
eval_batch_step = eval_batch_step[1]
if len(valid_dataloader) == 0:
logger.info(
'No images in the eval dataset; evaluation during training ' \
'will be disabled'
)
start_eval_step = 1e111
logger.info(
"During the training process, after the {}th iteration, " \
"an evaluation is run every {} iterations".
format(start_eval_step, eval_batch_step))
save_epoch_step = config['Global']['save_epoch_step']
save_model_dir = config['Global']['save_model_dir']
if not os.path.exists(save_model_dir):
os.makedirs(save_model_dir)
main_indicator = eval_class.main_indicator
best_model_dict = {main_indicator: 0}
best_model_dict.update(pre_best_model_dict)
train_stats = TrainingStats(log_smooth_window, ['lr'])
model_average = False
model.train()
use_srn = config['Architecture']['algorithm'] == "SRN"
extra_input_models = ["SRN", "NRTR", "SAR", "SEED", "SVTR"]
extra_input = False
if config['Architecture']['algorithm'] == 'Distillation':
for key in config['Architecture']["Models"]:
extra_input = extra_input or config['Architecture']['Models'][key][
'algorithm'] in extra_input_models
else:
extra_input = config['Architecture']['algorithm'] in extra_input_models
try:
model_type = config['Architecture']['model_type']
except:
model_type = None
algorithm = config['Architecture']['algorithm']
start_epoch = best_model_dict[
'start_epoch'] if 'start_epoch' in best_model_dict else 1
total_samples = 0
train_reader_cost = 0.0
train_batch_cost = 0.0
reader_start = time.time()
eta_meter = AverageMeter()
max_iter = len(train_dataloader) - 1 if platform.system(
) == "Windows" else len(train_dataloader)
for epoch in range(start_epoch, epoch_num + 1):
if train_dataloader.dataset.need_reset:
train_dataloader = build_dataloader(
config, 'Train', device, logger, seed=epoch)
max_iter = len(train_dataloader) - 1 if platform.system(
) == "Windows" else len(train_dataloader)
for idx, batch in enumerate(train_dataloader):
profiler.add_profiler_step(profiler_options)
train_reader_cost += time.time() - reader_start
if idx >= max_iter:
break
lr = optimizer.get_lr()
images = batch[0]
if use_srn:
model_average = True
# use amp
if scaler:
with paddle.amp.auto_cast():
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
else:
preds = model(images)
else:
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
elif model_type in ["kie", 'vqa']:
preds = model(batch)
else:
preds = model(images)
loss = loss_class(preds, batch)
avg_loss = loss['loss']
if scaler:
scaled_avg_loss = scaler.scale(avg_loss)
scaled_avg_loss.backward()
scaler.minimize(optimizer, scaled_avg_loss)
else:
avg_loss.backward()
optimizer.step()
optimizer.clear_grad()
if cal_metric_during_train and epoch % calc_epoch_interval == 0:  # metric calc is only needed for rec and cls
batch = [item.numpy() for item in batch]
if model_type in ['table', 'kie']:
eval_class(preds, batch)
else:
if config['Loss']['name'] in ['MultiLoss', 'MultiLoss_v2'
]: # for multi head loss
post_result = post_process_class(
preds['ctc'], batch[1]) # for CTC head out
else:
post_result = post_process_class(preds, batch[1])
eval_class(post_result, batch)
metric = eval_class.get_metric()
train_stats.update(metric)
train_batch_time = time.time() - reader_start
train_batch_cost += train_batch_time
eta_meter.update(train_batch_time)
global_step += 1
total_samples += len(images)
if not isinstance(lr_scheduler, float):
lr_scheduler.step()
# logger and visualdl
stats = {k: v.numpy().mean() for k, v in loss.items()}
stats['lr'] = lr
train_stats.update(stats)
if log_writer is not None and dist.get_rank() == 0:
log_writer.log_metrics(metrics=train_stats.get(), prefix="TRAIN", step=global_step)
if dist.get_rank() == 0 and (
(global_step > 0 and global_step % print_batch_step == 0) or
(idx >= len(train_dataloader) - 1)):
logs = train_stats.log()
eta_sec = ((epoch_num + 1 - epoch) * \
len(train_dataloader) - idx - 1) * eta_meter.avg
eta_sec_format = str(datetime.timedelta(seconds=int(eta_sec)))
strs = 'epoch: [{}/{}], global_step: {}, {}, avg_reader_cost: ' \
'{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, ' \
'ips: {:.5f} samples/s, eta: {}'.format(
epoch, epoch_num, global_step, logs,
train_reader_cost / print_batch_step,
train_batch_cost / print_batch_step,
total_samples / print_batch_step,
total_samples / train_batch_cost, eta_sec_format)
logger.info(strs)
total_samples = 0
train_reader_cost = 0.0
train_batch_cost = 0.0
# eval
if global_step > start_eval_step and \
(global_step - start_eval_step) % eval_batch_step == 0 \
and dist.get_rank() == 0:
if model_average:
Model_Average = paddle.incubate.optimizer.ModelAverage(
0.15,
parameters=model.parameters(),
min_average_window=10000,
max_average_window=15625)
Model_Average.apply()
cur_metric = eval(
model,
valid_dataloader,
post_process_class,
eval_class,
model_type,
extra_input=extra_input)
cur_metric_str = 'cur metric, {}'.format(', '.join(
['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
logger.info(cur_metric_str)
# logger metric
if log_writer is not None:
log_writer.log_metrics(metrics=cur_metric, prefix="EVAL", step=global_step)
if cur_metric[main_indicator] >= best_model_dict[
main_indicator]:
best_model_dict.update(cur_metric)
best_model_dict['best_epoch'] = epoch
save_model(
model,
optimizer,
save_model_dir,
logger,
config,
is_best=True,
prefix='best_accuracy',
best_model_dict=best_model_dict,
epoch=epoch,
global_step=global_step)
best_str = 'best metric, {}'.format(', '.join([
'{}: {}'.format(k, v) for k, v in best_model_dict.items()
]))
logger.info(best_str)
# logger best metric
if log_writer is not None:
log_writer.log_metrics(metrics={
"best_{}".format(main_indicator): best_model_dict[main_indicator]
}, prefix="EVAL", step=global_step)
log_writer.log_model(is_best=True, prefix="best_accuracy", metadata=best_model_dict)
reader_start = time.time()
if dist.get_rank() == 0:
save_model(
model,
optimizer,
save_model_dir,
logger,
config,
is_best=False,
prefix='latest',
best_model_dict=best_model_dict,
epoch=epoch,
global_step=global_step)
if log_writer is not None:
log_writer.log_model(is_best=False, prefix="latest")
if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0:
save_model(
model,
optimizer,
save_model_dir,
logger,
config,
is_best=False,
prefix='iter_epoch_{}'.format(epoch),
best_model_dict=best_model_dict,
epoch=epoch,
global_step=global_step)
if log_writer is not None:
log_writer.log_model(is_best=False, prefix='iter_epoch_{}'.format(epoch))
best_str = 'best metric, {}'.format(', '.join(
['{}: {}'.format(k, v) for k, v in best_model_dict.items()]))
logger.info(best_str)
if dist.get_rank() == 0 and log_writer is not None:
log_writer.close()
return
def eval(model,
valid_dataloader,
post_process_class,
eval_class,
model_type=None,
extra_input=False):
model.eval()
with paddle.no_grad():
total_frame = 0.0
total_time = 0.0
pbar = tqdm(
total=len(valid_dataloader),
desc='eval model:',
position=0,
leave=True)
max_iter = len(valid_dataloader) - 1 if platform.system(
) == "Windows" else len(valid_dataloader)
for idx, batch in enumerate(valid_dataloader):
if idx >= max_iter:
break
images = batch[0]
start = time.time()
if model_type == 'table' or extra_input:
preds = model(images, data=batch[1:])
elif model_type in ["kie", 'vqa']:
preds = model(batch)
else:
preds = model(images)
batch_numpy = []
for item in batch:
if isinstance(item, paddle.Tensor):
batch_numpy.append(item.numpy())
else:
batch_numpy.append(item)
# Obtain usable results from post-processing methods
total_time += time.time() - start
# Evaluate the results of the current batch
if model_type in ['table', 'kie']:
eval_class(preds, batch_numpy)
elif model_type in ['vqa']:
post_result = post_process_class(preds, batch_numpy)
eval_class(post_result, batch_numpy)
else:
post_result = post_process_class(preds, batch_numpy[1])
eval_class(post_result, batch_numpy)
pbar.update(1)
total_frame += len(images)
        # Get the final metric, e.g. acc or hmean
metric = eval_class.get_metric()
pbar.close()
model.train()
metric['fps'] = total_frame / total_time
return metric
def update_center(char_center, post_result, preds):
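    """Update per-character feature centers with one batch.

    char_center maps a character index to [mean_feature, count]; only
    samples whose predicted text matches the label contribute. The mean is
    updated incrementally: new_mean = (mean * n + feat) / (n + 1).
    """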
result, label = post_result
feats, logits = preds
logits = paddle.argmax(logits, axis=-1)
feats = feats.numpy()
logits = logits.numpy()
for idx_sample in range(len(label)):
if result[idx_sample][0] == label[idx_sample][0]:
feat = feats[idx_sample]
logit = logits[idx_sample]
for idx_time in range(len(logit)):
index = logit[idx_time]
if index in char_center.keys():
char_center[index][0] = (
char_center[index][0] * char_center[index][1] +
feat[idx_time]) / (char_center[index][1] + 1)
char_center[index][1] += 1
else:
char_center[index] = [feat[idx_time], 1]
return char_center
def get_center(model, eval_dataloader, post_process_class):
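    """Run the model over eval_dataloader and collect the mean feature of
    every correctly recognised character via update_center. Returns a dict
    mapping character index -> center feature vector.
    """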
pbar = tqdm(total=len(eval_dataloader), desc='get center:')
max_iter = len(eval_dataloader) - 1 if platform.system(
) == "Windows" else len(eval_dataloader)
char_center = dict()
for idx, batch in enumerate(eval_dataloader):
if idx >= max_iter:
break
        images = batch[0]
        preds = model(images)
        batch = [item.numpy() for item in batch]
# Obtain usable results from post-processing methods
post_result = post_process_class(preds, batch[1])
        # update the running character centers with this batch
char_center = update_center(char_center, post_result, preds)
pbar.update(1)
pbar.close()
for key in char_center.keys():
char_center[key] = char_center[key][0]
return char_center
def preprocess(is_train=False):
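    """Parse CLI arguments, load and merge the YAML config, set up logging,
    pick the paddle device and the metric writers, and return
    (config, device, logger, log_writer).
    """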
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
    config = merge_config(config, FLAGS.opt)
    profile_dic = {"profiler_options": FLAGS.profiler_options}
    config = merge_config(config, profile_dic)
if is_train:
# save_config
save_model_dir = config['Global']['save_model_dir']
os.makedirs(save_model_dir, exist_ok=True)
with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
yaml.dump(
dict(config), f, default_flow_style=False, sort_keys=False)
log_file = '{}/train.log'.format(save_model_dir)
else:
log_file = None
logger = get_logger(log_file=log_file)
    # check whether use_gpu=True is set with a cpu-only paddlepaddle build
    use_gpu = config['Global']['use_gpu']
    # check whether use_xpu=True is set with a cpu/gpu paddlepaddle build
    use_xpu = config['Global'].get('use_xpu', False)
    check_xpu(use_xpu)
alg = config['Architecture']['algorithm']
assert alg in [
'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'PREN', 'FCE', 'SVTR'
]
if use_xpu:
device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
    else:
        device = 'gpu:{}'.format(
            dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
check_device(use_gpu, use_xpu)
device = paddle.set_device(device)
config['Global']['distributed'] = dist.get_world_size() != 1
loggers = []
    if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']:
        save_model_dir = config['Global']['save_model_dir']
        log_writer = VDLLogger(save_model_dir)
        loggers.append(log_writer)
    if ('use_wandb' in config['Global'] and
            config['Global']['use_wandb']) or 'wandb' in config:
        save_dir = config['Global']['save_model_dir']
        wandb_params = config['wandb'] if 'wandb' in config else dict()
        # use the local save_dir here: save_model_dir is only bound when
        # the VisualDL branch above has run
        wandb_params.update({'save_dir': save_dir})
        log_writer = WandbLogger(**wandb_params, config=config)
        loggers.append(log_writer)
    print_dict(config, logger)
    if loggers:
        log_writer = Loggers(loggers)
    else:
        log_writer = None
logger.info('train with paddle {} and device {}'.format(paddle.__version__,
device))
return config, device, logger, log_writer
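# A minimal usage sketch of preprocess(), mirroring the entry script below:
#
#     config, device, logger, log_writer = preprocess(is_train=True)
#     logger.info(config['Global']['save_model_dir'])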
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..')))
import yaml
import paddle
import paddle.distributed as dist
from ppocr.data import build_dataloader
from ppocr.modeling.architectures import build_model
from ppocr.losses import build_loss
from ppocr.optimizer import build_optimizer
from ppocr.postprocess import build_post_process
from ppocr.metrics import build_metric
from ppocr.utils.save_load import load_model
from ppocr.utils.utility import set_seed
from ppocr.modeling.architectures import apply_to_static
import tools.program as program
def main(config, device, logger, vdl_writer):
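    """Build dataloaders, post-process, model, loss, optimizer and metric
    from the config, load any pretrained weights, then start training via
    program.train().
    """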
# init dist environment
if config['Global']['distributed']:
dist.init_parallel_env()
global_config = config['Global']
# build dataloader
train_dataloader = build_dataloader(config, 'Train', device, logger)
if len(train_dataloader) == 0:
        logger.error(
            "No images in the train dataset, please ensure\n" +
            "\t1. The number of images in train label_file_list is no "
            "smaller than the batch size.\n" +
            "\t2. The annotation file and data paths in the configuration "
            "file are set correctly.")
return
if config['Eval']:
valid_dataloader = build_dataloader(config, 'Eval', device, logger)
else:
valid_dataloader = None
# build post process
post_process_class = build_post_process(config['PostProcess'],
global_config)
# build model
# for rec algorithm
if hasattr(post_process_class, 'character'):
char_num = len(getattr(post_process_class, 'character'))
        # distillation model
        if config['Architecture']['algorithm'] in ['Distillation']:
for key in config['Architecture']["Models"]:
if config['Architecture']['Models'][key]['Head'][
'name'] == 'MultiHead': # for multi head
if config['PostProcess'][
'name'] == 'DistillationSARLabelDecode':
char_num = char_num - 2
# update SARLoss params
assert list(config['Loss']['loss_config_list'][-1].keys())[
0] == 'DistillationSARLoss'
config['Loss']['loss_config_list'][-1][
'DistillationSARLoss']['ignore_index'] = char_num + 1
out_channels_list = {}
out_channels_list['CTCLabelDecode'] = char_num
out_channels_list['SARLabelDecode'] = char_num + 2
config['Architecture']['Models'][key]['Head'][
'out_channels_list'] = out_channels_list
else:
config['Architecture']["Models"][key]["Head"][
'out_channels'] = char_num
elif config['Architecture']['Head'][
'name'] == 'MultiHead': # for multi head
if config['PostProcess']['name'] == 'SARLabelDecode':
char_num = char_num - 2
# update SARLoss params
assert list(config['Loss']['loss_config_list'][1].keys())[
0] == 'SARLoss'
if config['Loss']['loss_config_list'][1]['SARLoss'] is None:
config['Loss']['loss_config_list'][1]['SARLoss'] = {
'ignore_index': char_num + 1
}
else:
config['Loss']['loss_config_list'][1]['SARLoss'][
'ignore_index'] = char_num + 1
out_channels_list = {}
out_channels_list['CTCLabelDecode'] = char_num
out_channels_list['SARLabelDecode'] = char_num + 2
config['Architecture']['Head'][
'out_channels_list'] = out_channels_list
else: # base rec model
config['Architecture']["Head"]['out_channels'] = char_num
if config['PostProcess']['name'] == 'SARLabelDecode': # for SAR model
config['Loss']['ignore_index'] = char_num - 1
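    # Note on the channel bookkeeping above: with a SAR decode head the raw
    # character count is first reduced by SAR's two special tokens, then the
    # SAR branch gets char_num + 2 output channels while CTC gets char_num.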
model = build_model(config['Architecture'])
if config['Global']['distributed']:
model = paddle.DataParallel(model)
model = apply_to_static(model, config, logger)
# build loss
loss_class = build_loss(config['Loss'])
# build optim
optimizer, lr_scheduler = build_optimizer(
config['Optimizer'],
epochs=config['Global']['epoch_num'],
step_each_epoch=len(train_dataloader),
model=model)
# build metric
eval_class = build_metric(config['Metric'])
# load pretrain model
pre_best_model_dict = load_model(config, model, optimizer,
config['Architecture']["model_type"])
logger.info('train dataloader has {} iters'.format(len(train_dataloader)))
if valid_dataloader is not None:
logger.info('valid dataloader has {} iters'.format(
len(valid_dataloader)))
use_amp = config["Global"].get("use_amp", False)
if use_amp:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        # paddle.fluid is deprecated; paddle.set_flags is the public API
        paddle.set_flags(AMP_RELATED_FLAGS_SETTING)
scale_loss = config["Global"].get("scale_loss", 1.0)
use_dynamic_loss_scaling = config["Global"].get(
"use_dynamic_loss_scaling", False)
scaler = paddle.amp.GradScaler(
init_loss_scaling=scale_loss,
use_dynamic_loss_scaling=use_dynamic_loss_scaling)
else:
scaler = None
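    # Rough sketch of how program.train() is expected to consume the scaler
    # in the AMP path (see tools/program.py):
    #
    #     with paddle.amp.auto_cast():
    #         preds = model(images)
    #     avg_loss = loss_class(preds, batch)['loss']
    #     scaled = scaler.scale(avg_loss)
    #     scaled.backward()
    #     scaler.minimize(optimizer, scaled)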
# start train
program.train(config, train_dataloader, valid_dataloader, device, model,
loss_class, optimizer, lr_scheduler, post_process_class,
eval_class, pre_best_model_dict, logger, vdl_writer, scaler)
def test_reader(config, device, logger):
    """Iterate over the train dataloader once and log per-batch read times."""
    loader = build_dataloader(config, 'Train', device, logger)
    import time
    starttime = time.time()
    count = 0
    try:
        for data in loader():
            count += 1
            batch_time = time.time() - starttime
            starttime = time.time()
            logger.info("reader: {}, {}, {}".format(count,
                                                    len(data[0]), batch_time))
    except Exception as e:
        logger.info(e)
    logger.info("finish reader: {}, Success!".format(count))
if __name__ == '__main__':
config, device, logger, vdl_writer = program.preprocess(is_train=True)
    seed = config['Global'].get('seed', 1024)
set_seed(seed)
main(config, device, logger, vdl_writer)
# test_reader(config, device, logger)
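# Typical invocation (the config path here is only an example):
#   python tools/train.py -c configs/rec/rec_icdar15_train.yml \
#       -o Global.use_gpu=false
# -c selects the YAML config; -o overrides individual config keys.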