"packaging/vscode:/vscode.git/clone" did not exist on "16caba5e72d4995a1743c72824b923920060cb26"
Unverified commit f4ffdfe8, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2062 from myhloli/dev

feat: support 3.10~3.12 & remove paddle
parents ec566d22 cb3a4314
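
# --- New YAML resource: maps each OCR language to its PyTorch det/rec weights and character dict ---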
lang:
ch:
det: ch_PP-OCRv4_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
en:
det: en_PP-OCRv3_det_infer.pth
rec: en_PP-OCRv4_rec_infer.pth
dict: en_dict.txt
korean:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: korean_PP-OCRv3_rec_infer.pth
dict: korean_dict.txt
japan:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: japan_PP-OCRv3_rec_infer.pth
dict: japan_dict.txt
chinese_cht:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: chinese_cht_PP-OCRv3_rec_infer.pth
dict: chinese_cht_dict.txt
ta:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ta_PP-OCRv3_rec_infer.pth
dict: ta_dict.txt
te:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: te_PP-OCRv3_rec_infer.pth
dict: te_dict.txt
ka:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ka_PP-OCRv3_rec_infer.pth
dict: ka_dict.txt
latin:
det: en_PP-OCRv3_det_infer.pth
rec: latin_PP-OCRv3_rec_infer.pth
dict: latin_dict.txt
arabic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: arabic_PP-OCRv3_rec_infer.pth
dict: arabic_dict.txt
cyrillic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: cyrillic_PP-OCRv3_rec_infer.pth
dict: cyrillic_dict.txt
devanagari:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: devanagari_PP-OCRv3_rec_infer.pth
dict: devanagari_dict.txt
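
For orientation, a minimal sketch of how a consumer could resolve model files from this mapping; the config path, models directory, and function name are illustrative assumptions, not part of this commit:

# Sketch only: resolve det/rec/dict file names for one language from the mapping above.
# cfg_path and ocr_models_dir are hypothetical placeholders.
from pathlib import Path
import yaml

def resolve_ocr_models(cfg_path, ocr_models_dir, lang='ch'):
    with open(cfg_path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    entry = cfg['lang'][lang]
    base = Path(ocr_models_dir)
    # detection weights (.pth), recognition weights (.pth), character dictionary (.txt)
    return base / entry['det'], base / entry['rec'], base / entry['dict']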
# Copyright (c) Opendatalab. All rights reserved.
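
# --- predict_cls.py: text-direction classifier (imported later as predict_cls) ---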
import cv2
import copy
import numpy as np
import math
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.postprocess import build_post_process
class TextClassifier(BaseOCRV20):
def __init__(self, args, **kwargs):
self.device = args.device
self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
self.cls_batch_num = args.cls_batch_num
self.cls_thresh = args.cls_thresh
postprocess_params = {
'name': 'ClsPostProcess',
"label_list": args.label_list,
}
self.postprocess_op = build_post_process(postprocess_params)
self.weights_path = args.cls_model_path
self.yaml_path = args.cls_yaml_path
network_config = utility.get_arch_config(self.weights_path)
super(TextClassifier, self).__init__(network_config, **kwargs)
self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
self.limited_max_width = args.limited_max_width
self.limited_min_width = args.limited_min_width
self.load_pytorch_weights(self.weights_path)
self.net.eval()
self.net.to(self.device)
def resize_norm_img(self, img):
imgC, imgH, imgW = self.cls_image_shape
h = img.shape[0]
w = img.shape[1]
ratio = w / float(h)
imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
ratio_imgH = math.ceil(imgH * ratio)
ratio_imgH = max(ratio_imgH, self.limited_min_width)
if ratio_imgH > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
if self.cls_image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def __call__(self, img_list):
img_list = copy.deepcopy(img_list)
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the cls process
indices = np.argsort(np.array(width_list))
cls_res = [['', 0.0]] * img_num
batch_num = self.cls_batch_num
elapse = 0
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img(img_list[indices[ino]])
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
inp = inp.to(self.device)
prob_out = self.net(inp)
prob_out = prob_out.cpu().numpy()
cls_result = self.postprocess_op(prob_out)
elapse += time.time() - starttime
for rno in range(len(cls_result)):
label, score = cls_result[rno]
cls_res[indices[beg_img_no + rno]] = [label, score]
            if '180' in label and score > self.cls_thresh:
                img_list[indices[beg_img_no + rno]] = cv2.rotate(
                    img_list[indices[beg_img_no + rno]], cv2.ROTATE_180)  # 1 == cv2.ROTATE_180
return img_list, cls_res, elapse
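
# --- predict_det.py: text detector (imported later as predict_det) ---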
import sys
import numpy as np
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.data import create_operators, transform
from ...pytorchocr.postprocess import build_post_process
class TextDetector(BaseOCRV20):
def __init__(self, args, **kwargs):
self.args = args
self.det_algorithm = args.det_algorithm
self.device = args.device
pre_process_list = [{
'DetResizeForTest': {
'limit_side_len': args.det_limit_side_len,
'limit_type': args.det_limit_type,
}
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
'mean': [0.485, 0.456, 0.406],
'scale': '1./255.',
'order': 'hwc'
}
}, {
'ToCHWImage': None
}, {
'KeepKeys': {
'keep_keys': ['image', 'shape']
}
}]
postprocess_params = {}
if self.det_algorithm == "DB":
postprocess_params['name'] = 'DBPostProcess'
postprocess_params["thresh"] = args.det_db_thresh
postprocess_params["box_thresh"] = args.det_db_box_thresh
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
elif self.det_algorithm == "DB++":
postprocess_params['name'] = 'DBPostProcess'
postprocess_params["thresh"] = args.det_db_thresh
postprocess_params["box_thresh"] = args.det_db_box_thresh
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
pre_process_list[1] = {
'NormalizeImage': {
'std': [1.0, 1.0, 1.0],
'mean':
[0.48109378172549, 0.45752457890196, 0.40787054090196],
'scale': '1./255.',
'order': 'hwc'
}
}
elif self.det_algorithm == "EAST":
postprocess_params['name'] = 'EASTPostProcess'
postprocess_params["score_thresh"] = args.det_east_score_thresh
postprocess_params["cover_thresh"] = args.det_east_cover_thresh
postprocess_params["nms_thresh"] = args.det_east_nms_thresh
elif self.det_algorithm == "SAST":
pre_process_list[0] = {
'DetResizeForTest': {
'resize_long': args.det_limit_side_len
}
}
postprocess_params['name'] = 'SASTPostProcess'
postprocess_params["score_thresh"] = args.det_sast_score_thresh
postprocess_params["nms_thresh"] = args.det_sast_nms_thresh
self.det_sast_polygon = args.det_sast_polygon
if self.det_sast_polygon:
postprocess_params["sample_pts_num"] = 6
postprocess_params["expand_scale"] = 1.2
postprocess_params["shrink_ratio_of_width"] = 0.2
else:
postprocess_params["sample_pts_num"] = 2
postprocess_params["expand_scale"] = 1.0
postprocess_params["shrink_ratio_of_width"] = 0.3
elif self.det_algorithm == "PSE":
postprocess_params['name'] = 'PSEPostProcess'
postprocess_params["thresh"] = args.det_pse_thresh
postprocess_params["box_thresh"] = args.det_pse_box_thresh
postprocess_params["min_area"] = args.det_pse_min_area
postprocess_params["box_type"] = args.det_pse_box_type
postprocess_params["scale"] = args.det_pse_scale
self.det_pse_box_type = args.det_pse_box_type
elif self.det_algorithm == "FCE":
pre_process_list[0] = {
'DetResizeForTest': {
'rescale_img': [1080, 736]
}
}
postprocess_params['name'] = 'FCEPostProcess'
postprocess_params["scales"] = args.scales
postprocess_params["alpha"] = args.alpha
postprocess_params["beta"] = args.beta
postprocess_params["fourier_degree"] = args.fourier_degree
postprocess_params["box_type"] = args.det_fce_box_type
else:
print("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.weights_path = args.det_model_path
self.yaml_path = args.det_yaml_path
network_config = utility.get_arch_config(self.weights_path)
super(TextDetector, self).__init__(network_config, **kwargs)
self.load_pytorch_weights(self.weights_path)
self.net.eval()
self.net.to(self.device)
def order_points_clockwise(self, pts):
"""
reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
# sort the points based on their x-coordinates
"""
xSorted = pts[np.argsort(pts[:, 0]), :]
        # grab the left-most and right-most points from the sorted
        # x-coordinate points
leftMost = xSorted[:2, :]
rightMost = xSorted[2:, :]
# now, sort the left-most coordinates according to their
# y-coordinates so we can grab the top-left and bottom-left
# points, respectively
leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
(tl, bl) = leftMost
rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
(tr, br) = rightMost
rect = np.array([tl, tr, br, bl], dtype="float32")
return rect
def clip_det_res(self, points, img_height, img_width):
for pno in range(points.shape[0]):
points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
return points
def filter_tag_det_res(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.order_points_clockwise(box)
box = self.clip_det_res(box, img_height, img_width)
rect_width = int(np.linalg.norm(box[0] - box[1]))
rect_height = int(np.linalg.norm(box[0] - box[3]))
if rect_width <= 3 or rect_height <= 3:
continue
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.clip_det_res(box, img_height, img_width)
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
ori_im = img.copy()
data = {'image': img}
data = transform(data, self.preprocess_op)
img, shape_list = data
if img is None:
return None, 0
img = np.expand_dims(img, axis=0)
shape_list = np.expand_dims(shape_list, axis=0)
img = img.copy()
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(img)
inp = inp.to(self.device)
outputs = self.net(inp)
preds = {}
if self.det_algorithm == "EAST":
preds['f_geo'] = outputs['f_geo'].cpu().numpy()
preds['f_score'] = outputs['f_score'].cpu().numpy()
elif self.det_algorithm == 'SAST':
preds['f_border'] = outputs['f_border'].cpu().numpy()
preds['f_score'] = outputs['f_score'].cpu().numpy()
preds['f_tco'] = outputs['f_tco'].cpu().numpy()
preds['f_tvo'] = outputs['f_tvo'].cpu().numpy()
elif self.det_algorithm in ['DB', 'PSE', 'DB++']:
preds['maps'] = outputs['maps'].cpu().numpy()
elif self.det_algorithm == 'FCE':
for i, (k, output) in enumerate(outputs.items()):
preds['level_{}'.format(i)] = output
else:
raise NotImplementedError
post_result = self.postprocess_op(preds, shape_list)
dt_boxes = post_result[0]['points']
if (self.det_algorithm == "SAST" and
self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and
self.postprocess_op.box_type == 'poly'):
dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
else:
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
elapse = time.time() - starttime
return dt_boxes, elapse
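
# --- predict_rec.py: text recognizer (imported later as predict_rec) ---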
from PIL import Image
import cv2
import numpy as np
import math
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.postprocess import build_post_process
class TextRecognizer(BaseOCRV20):
def __init__(self, args, **kwargs):
self.device = args.device
self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
self.character_type = args.rec_char_type
self.rec_batch_num = args.rec_batch_num
self.rec_algorithm = args.rec_algorithm
self.max_text_length = args.max_text_length
postprocess_params = {
'name': 'CTCLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
if self.rec_algorithm == "SRN":
postprocess_params = {
'name': 'SRNLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "RARE":
postprocess_params = {
'name': 'AttnLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'NRTR':
postprocess_params = {
'name': 'NRTRLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "SAR":
postprocess_params = {
'name': 'SARLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'ViTSTR':
postprocess_params = {
'name': 'ViTSTRLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "CAN":
self.inverse = args.rec_image_inverse
postprocess_params = {
'name': 'CANLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'RFL':
postprocess_params = {
'name': 'RFLLabelDecode',
"character_dict_path": None,
"use_space_char": args.use_space_char
}
self.postprocess_op = build_post_process(postprocess_params)
self.limited_max_width = args.limited_max_width
self.limited_min_width = args.limited_min_width
self.weights_path = args.rec_model_path
self.yaml_path = args.rec_yaml_path
network_config = utility.get_arch_config(self.weights_path)
weights = self.read_pytorch_weights(self.weights_path)
self.out_channels = self.get_out_channels(weights)
if self.rec_algorithm == 'NRTR':
self.out_channels = list(weights.values())[-1].numpy().shape[0]
elif self.rec_algorithm == 'SAR':
self.out_channels = list(weights.values())[-3].numpy().shape[0]
kwargs['out_channels'] = self.out_channels
super(TextRecognizer, self).__init__(network_config, **kwargs)
self.load_state_dict(weights)
self.net.eval()
self.net.to(self.device)
def resize_norm_img(self, img, max_wh_ratio):
imgC, imgH, imgW = self.rec_image_shape
if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR':
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# return padding_im
image_pil = Image.fromarray(np.uint8(img))
if self.rec_algorithm == 'ViTSTR':
img = image_pil.resize([imgW, imgH], Image.BICUBIC)
else:
                img = image_pil.resize([imgW, imgH], Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
img = np.array(img)
norm_img = np.expand_dims(img, -1)
norm_img = norm_img.transpose((2, 0, 1))
if self.rec_algorithm == 'ViTSTR':
norm_img = norm_img.astype(np.float32) / 255.
else:
norm_img = norm_img.astype(np.float32) / 128. - 1.
return norm_img
elif self.rec_algorithm == 'RFL':
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
resized_image = resized_image.astype('float32')
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
resized_image -= 0.5
resized_image /= 0.5
return resized_image
assert imgC == img.shape[2]
max_wh_ratio = max(max_wh_ratio, imgW / imgH)
imgW = int((imgH * max_wh_ratio))
imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
h, w = img.shape[:2]
ratio = w / float(h)
ratio_imgH = math.ceil(imgH * ratio)
ratio_imgH = max(ratio_imgH, self.limited_min_width)
if ratio_imgH > imgW:
resized_w = imgW
else:
resized_w = int(ratio_imgH)
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def resize_norm_img_svtr(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
return resized_image
def resize_norm_img_srn(self, img, image_shape):
imgC, imgH, imgW = image_shape
img_black = np.zeros((imgH, imgW))
im_hei = img.shape[0]
im_wid = img.shape[1]
if im_wid <= im_hei * 1:
img_new = cv2.resize(img, (imgH * 1, imgH))
elif im_wid <= im_hei * 2:
img_new = cv2.resize(img, (imgH * 2, imgH))
elif im_wid <= im_hei * 3:
img_new = cv2.resize(img, (imgH * 3, imgH))
else:
img_new = cv2.resize(img, (imgW, imgH))
img_np = np.asarray(img_new)
img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
img_black[:, 0:img_np.shape[1]] = img_np
img_black = img_black[:, :, np.newaxis]
row, col, c = img_black.shape
c = 1
return np.reshape(img_black, (c, row, col)).astype(np.float32)
def srn_other_inputs(self, image_shape, num_heads, max_text_length):
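        # Build SRN's auxiliary inputs: position ids for the 1/8-scale visual feature map and for
        # the GSRM word stream, plus upper/lower-triangular attention biases (-1e9) that hide
        # future and past tokens, respectively, from the two GSRM passes.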
imgC, imgH, imgW = image_shape
feature_dim = int((imgH / 8) * (imgW / 8))
encoder_word_pos = np.array(range(0, feature_dim)).reshape(
(feature_dim, 1)).astype('int64')
gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
(max_text_length, 1)).astype('int64')
gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias1 = np.tile(
gsrm_slf_attn_bias1,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias2 = np.tile(
gsrm_slf_attn_bias2,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
encoder_word_pos = encoder_word_pos[np.newaxis, :]
gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
return [
encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
def process_image_srn(self, img, image_shape, num_heads, max_text_length):
norm_img = self.resize_norm_img_srn(img, image_shape)
norm_img = norm_img[np.newaxis, :]
[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
self.srn_other_inputs(image_shape, num_heads, max_text_length)
gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
encoder_word_pos = encoder_word_pos.astype(np.int64)
gsrm_word_pos = gsrm_word_pos.astype(np.int64)
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
def resize_norm_img_sar(self, img, image_shape,
width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
w = img.shape[1]
valid_ratio = 1.0
# make sure new_width is an integral multiple of width_divisor.
width_divisor = int(1 / width_downsample_ratio)
# resize
ratio = w / float(h)
resize_w = math.ceil(imgH * ratio)
if resize_w % width_divisor != 0:
resize_w = round(resize_w / width_divisor) * width_divisor
if imgW_min is not None:
resize_w = max(imgW_min, resize_w)
if imgW_max is not None:
valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
resize_w = min(imgW_max, resize_w)
resized_image = cv2.resize(img, (resize_w, imgH))
resized_image = resized_image.astype('float32')
# norm
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
resize_shape = resized_image.shape
padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
padding_im[:, :, 0:resize_w] = resized_image
pad_shape = padding_im.shape
return padding_im, resize_shape, pad_shape, valid_ratio
def norm_img_can(self, img, image_shape):
img = cv2.cvtColor(
img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
if self.inverse:
img = 255 - img
if self.rec_image_shape[0] == 1:
h, w = img.shape
_, imgH, imgW = self.rec_image_shape
if h < imgH or w < imgW:
padding_h = max(imgH - h, 0)
padding_w = max(imgW - w, 0)
img_padded = np.pad(img, ((0, padding_h), (0, padding_w)),
'constant',
constant_values=(255))
img = img_padded
        img = np.expand_dims(img, 0) / 255.0  # h,w -> 1,h,w
img = img.astype('float32')
return img
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the recognition process
indices = np.argsort(np.array(width_list))
# rec_res = []
rec_res = [['', 0.0]] * img_num
batch_num = self.rec_batch_num
elapse = 0
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
# h, w = img_list[ino].shape[0:2]
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
if self.rec_algorithm == "SAR":
norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
img_list[indices[ino]], self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
valid_ratio = np.expand_dims(valid_ratio, axis=0)
valid_ratios = []
valid_ratios.append(valid_ratio)
norm_img_batch.append(norm_img)
elif self.rec_algorithm == "SVTR":
norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
elif self.rec_algorithm == "SRN":
norm_img = self.process_image_srn(img_list[indices[ino]],
self.rec_image_shape, 8,
self.max_text_length)
encoder_word_pos_list = []
gsrm_word_pos_list = []
gsrm_slf_attn_bias1_list = []
gsrm_slf_attn_bias2_list = []
encoder_word_pos_list.append(norm_img[1])
gsrm_word_pos_list.append(norm_img[2])
gsrm_slf_attn_bias1_list.append(norm_img[3])
gsrm_slf_attn_bias2_list.append(norm_img[4])
norm_img_batch.append(norm_img[0])
elif self.rec_algorithm == "CAN":
norm_img = self.norm_img_can(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_image_mask = np.ones(norm_img.shape, dtype='float32')
word_label = np.ones([1, 36], dtype='int64')
norm_img_mask_batch = []
word_label_list = []
norm_img_mask_batch.append(norm_image_mask)
word_label_list.append(word_label)
else:
norm_img = self.resize_norm_img(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
if self.rec_algorithm == "SRN":
starttime = time.time()
encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
gsrm_slf_attn_bias1_list = np.concatenate(
gsrm_slf_attn_bias1_list)
gsrm_slf_attn_bias2_list = np.concatenate(
gsrm_slf_attn_bias2_list)
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
encoder_word_pos_inp = torch.from_numpy(encoder_word_pos_list)
gsrm_word_pos_inp = torch.from_numpy(gsrm_word_pos_list)
gsrm_slf_attn_bias1_inp = torch.from_numpy(gsrm_slf_attn_bias1_list)
gsrm_slf_attn_bias2_inp = torch.from_numpy(gsrm_slf_attn_bias2_list)
# if self.use_gpu:
# inp = inp.cuda()
# encoder_word_pos_inp = encoder_word_pos_inp.cuda()
# gsrm_word_pos_inp = gsrm_word_pos_inp.cuda()
# gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.cuda()
# gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.cuda()
inp = inp.to(self.device)
encoder_word_pos_inp = encoder_word_pos_inp.to(self.device)
gsrm_word_pos_inp = gsrm_word_pos_inp.to(self.device)
gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.to(self.device)
gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.to(self.device)
backbone_out = self.net.backbone(inp) # backbone_feat
prob_out = self.net.head(backbone_out, [encoder_word_pos_inp, gsrm_word_pos_inp, gsrm_slf_attn_bias1_inp, gsrm_slf_attn_bias2_inp])
# preds = {"predict": prob_out[2]}
preds = {"predict": prob_out["predict"]}
elif self.rec_algorithm == "SAR":
starttime = time.time()
# valid_ratios = np.concatenate(valid_ratios)
# inputs = [
# norm_img_batch,
# valid_ratios,
# ]
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
# if self.use_gpu:
# inp = inp.cuda()
inp = inp.to(self.device)
preds = self.net(inp)
elif self.rec_algorithm == "CAN":
starttime = time.time()
norm_img_mask_batch = np.concatenate(norm_img_mask_batch)
word_label_list = np.concatenate(word_label_list)
inputs = [norm_img_batch, norm_img_mask_batch, word_label_list]
inp = [torch.from_numpy(e_i) for e_i in inputs]
inp = [e_i.to(self.device) for e_i in inp]
with torch.no_grad():
outputs = self.net(inp)
                    outputs = [v.cpu().numpy() for v in outputs]
preds = outputs
else:
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
inp = inp.to(self.device)
prob_out = self.net(inp)
if isinstance(prob_out, list):
preds = [v.cpu().numpy() for v in prob_out]
else:
preds = prob_out.cpu().numpy()
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
elapse += time.time() - starttime
return rec_res, elapse
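
# --- predict_system.py (assumed name): end-to-end pipeline chaining detector, classifier, and recognizer ---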
import cv2
import copy
import numpy as np
from . import predict_rec
from . import predict_det
from . import predict_cls
class TextSystem(object):
def __init__(self, args, **kwargs):
self.text_detector = predict_det.TextDetector(args, **kwargs)
self.text_recognizer = predict_rec.TextRecognizer(args, **kwargs)
self.use_angle_cls = args.use_angle_cls
self.drop_score = args.drop_score
if self.use_angle_cls:
self.text_classifier = predict_cls.TextClassifier(args, **kwargs)
def get_rotate_crop_image(self, img, points):
'''
img_height, img_width = img.shape[0:2]
left = int(np.min(points[:, 0]))
right = int(np.max(points[:, 0]))
top = int(np.min(points[:, 1]))
bottom = int(np.max(points[:, 1]))
img_crop = img[top:bottom, left:right, :].copy()
points[:, 0] = points[:, 0] - left
points[:, 1] = points[:, 1] - top
'''
img_crop_width = int(
max(
np.linalg.norm(points[0] - points[1]),
np.linalg.norm(points[2] - points[3])))
img_crop_height = int(
max(
np.linalg.norm(points[0] - points[3]),
np.linalg.norm(points[1] - points[2])))
pts_std = np.float32([[0, 0], [img_crop_width, 0],
[img_crop_width, img_crop_height],
[0, img_crop_height]])
M = cv2.getPerspectiveTransform(points, pts_std)
dst_img = cv2.warpPerspective(
img,
M, (img_crop_width, img_crop_height),
borderMode=cv2.BORDER_REPLICATE,
flags=cv2.INTER_CUBIC)
dst_img_height, dst_img_width = dst_img.shape[0:2]
if dst_img_height * 1.0 / dst_img_width >= 1.5:
dst_img = np.rot90(dst_img)
return dst_img
def __call__(self, img):
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
print("dt_boxes num : {}, elapse : {}".format(
len(dt_boxes), elapse))
if dt_boxes is None:
return None, None
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
print("cls num : {}, elapse : {}".format(
len(img_crop_list), elapse))
rec_res, elapse = self.text_recognizer(img_crop_list)
print("rec_res num : {}, elapse : {}".format(
len(rec_res), elapse))
# self.print_draw_crop_rec_res(img_crop_list, rec_res)
filter_boxes, filter_rec_res = [], []
        for box, rec_result in zip(dt_boxes, rec_res):
            text, score = rec_result
            if score >= self.drop_score:
                filter_boxes.append(box)
                filter_rec_res.append(rec_result)
return filter_boxes, filter_rec_res
def sorted_boxes(dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
(_boxes[i + 1][0][0] < _boxes[i][0][0]):
tmp = _boxes[i]
_boxes[i] = _boxes[i + 1]
_boxes[i + 1] = tmp
return _boxes
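
# --- pytorchocr_utility.py: argument parsing and shared helpers (imported above as pytorchocr_utility) ---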
import os
import math
from pathlib import Path
import numpy as np
import cv2
import argparse
root_dir = Path(__file__).resolve().parent.parent.parent
DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml"
def init_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
# params for prediction engine
parser.add_argument("--use_gpu", type=str2bool, default=False)
parser.add_argument("--det", type=str2bool, default=True)
parser.add_argument("--rec", type=str2bool, default=True)
parser.add_argument("--device", type=str, default='cpu')
# parser.add_argument("--ir_optim", type=str2bool, default=True)
# parser.add_argument("--use_tensorrt", type=str2bool, default=False)
# parser.add_argument("--use_fp16", type=str2bool, default=False)
parser.add_argument("--gpu_mem", type=int, default=500)
parser.add_argument("--warmup", type=str2bool, default=False)
# params for text detector
parser.add_argument("--image_dir", type=str)
parser.add_argument("--det_algorithm", type=str, default='DB')
parser.add_argument("--det_model_path", type=str)
parser.add_argument("--det_limit_side_len", type=float, default=960)
parser.add_argument("--det_limit_type", type=str, default='max')
# DB parmas
parser.add_argument("--det_db_thresh", type=float, default=0.3)
parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
parser.add_argument("--max_batch_size", type=int, default=10)
parser.add_argument("--use_dilation", type=str2bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast")
# EAST parmas
parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
# SAST parmas
parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
# PSE parmas
parser.add_argument("--det_pse_thresh", type=float, default=0)
parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
parser.add_argument("--det_pse_min_area", type=float, default=16)
parser.add_argument("--det_pse_box_type", type=str, default='box')
parser.add_argument("--det_pse_scale", type=int, default=1)
# FCE parmas
parser.add_argument("--scales", type=list, default=[8, 16, 32])
parser.add_argument("--alpha", type=float, default=1.0)
parser.add_argument("--beta", type=float, default=1.0)
parser.add_argument("--fourier_degree", type=int, default=5)
parser.add_argument("--det_fce_box_type", type=str, default='poly')
# params for text recognizer
parser.add_argument("--rec_algorithm", type=str, default='CRNN')
parser.add_argument("--rec_model_path", type=str)
parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
parser.add_argument("--rec_char_type", type=str, default='ch')
parser.add_argument("--rec_batch_num", type=int, default=6)
parser.add_argument("--max_text_length", type=int, default=25)
parser.add_argument("--use_space_char", type=str2bool, default=True)
parser.add_argument("--drop_score", type=float, default=0.5)
parser.add_argument("--limited_max_width", type=int, default=1280)
parser.add_argument("--limited_min_width", type=int, default=16)
parser.add_argument(
"--vis_font_path", type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf'))
parser.add_argument(
"--rec_char_dict_path",
type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
'pytorchocr/utils/ppocr_keys_v1.txt'))
# params for text classifier
parser.add_argument("--use_angle_cls", type=str2bool, default=False)
parser.add_argument("--cls_model_path", type=str)
parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
parser.add_argument("--label_list", type=list, default=['0', '180'])
parser.add_argument("--cls_batch_num", type=int, default=6)
parser.add_argument("--cls_thresh", type=float, default=0.9)
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
parser.add_argument("--use_pdserving", type=str2bool, default=False)
# params for e2e
parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
parser.add_argument("--e2e_model_path", type=str)
parser.add_argument("--e2e_limit_side_len", type=float, default=768)
parser.add_argument("--e2e_limit_type", type=str, default='max')
# PGNet parmas
parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
parser.add_argument(
"--e2e_char_dict_path", type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
'pytorchocr/utils/ic15_dict.txt'))
parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True)
parser.add_argument("--e2e_pgnet_mode", type=str, default='fast')
# SR parmas
parser.add_argument("--sr_model_path", type=str)
parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
parser.add_argument("--sr_batch_num", type=int, default=1)
# params .yaml
parser.add_argument("--det_yaml_path", type=str, default=None)
parser.add_argument("--rec_yaml_path", type=str, default=None)
parser.add_argument("--cls_yaml_path", type=str, default=None)
parser.add_argument("--e2e_yaml_path", type=str, default=None)
parser.add_argument("--sr_yaml_path", type=str, default=None)
# multi-process
parser.add_argument("--use_mp", type=str2bool, default=False)
parser.add_argument("--total_process_num", type=int, default=1)
parser.add_argument("--process_id", type=int, default=0)
parser.add_argument("--benchmark", type=str2bool, default=False)
parser.add_argument("--save_log_path", type=str, default="./log_output/")
parser.add_argument("--show_log", type=str2bool, default=True)
return parser
def parse_args():
parser = init_args()
return parser.parse_args()
def get_default_config(args):
return vars(args)
def read_network_config_from_yaml(yaml_path, char_num=None):
if not os.path.exists(yaml_path):
        raise FileNotFoundError('{} does not exist.'.format(yaml_path))
import yaml
with open(yaml_path, encoding='utf-8') as f:
res = yaml.safe_load(f)
if res.get('Architecture') is None:
raise ValueError('{} has no Architecture'.format(yaml_path))
if res['Architecture']['Head']['name'] == 'MultiHead' and char_num is not None:
res['Architecture']['Head']['out_channels_list'] = {
'CTCLabelDecode': char_num,
'SARLabelDecode': char_num + 2,
'NRTRLabelDecode': char_num + 3
}
return res['Architecture']
def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
if not os.path.exists(os.path.abspath(weights_path)):
        raise FileNotFoundError('{} was not found.'.format(weights_path))
if yaml_path is not None:
return read_network_config_from_yaml(yaml_path, char_num=char_num)
def resize_img(img, input_size=600):
"""
resize img and limit the longest side of the image to input_size
"""
img = np.array(img)
im_shape = img.shape
im_size_max = np.max(im_shape[0:2])
im_scale = float(input_size) / float(im_size_max)
img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
return img
def str_count(s):
"""
Count the number of Chinese characters,
a single English character and a single number
equal to half the length of Chinese characters.
args:
s(string): the input of string
return(int):
the number of Chinese characters
"""
import string
count_zh = count_pu = 0
s_len = len(s)
en_dg_count = 0
for c in s:
if c in string.ascii_letters or c.isdigit() or c.isspace():
en_dg_count += 1
elif c.isalpha():
count_zh += 1
else:
count_pu += 1
return s_len - math.ceil(en_dg_count / 2)
def base64_to_cv2(b64str):
import base64
data = base64.b64decode(b64str.encode('utf8'))
    data = np.frombuffer(data, np.uint8)  # np.fromstring is deprecated for binary input
data = cv2.imdecode(data, cv2.IMREAD_COLOR)
return data
def get_arch_config(model_path):
from omegaconf import OmegaConf
all_arch_config = OmegaConf.load(DEFAULT_CFG_PATH)
path = Path(model_path)
file_name = path.stem
if file_name not in all_arch_config:
raise ValueError(f"architecture {file_name} is not in arch_config.yaml")
arch_config = all_arch_config[file_name]
return arch_config
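
Taken together, the modules above form a self-contained PyTorch OCR pipeline. A minimal usage sketch follows; the import paths and model file locations are assumptions for illustration, and the weight file stems must match entries in arch_config.yaml, since get_arch_config keys on the file stem:

# Hypothetical end-to-end usage; adjust imports to the real package layout.
import cv2
from predict_system import TextSystem        # assumed module name, per the imports above
from pytorchocr_utility import parse_args

args = parse_args()
args.det_model_path = 'models/ch_PP-OCRv4_det_infer.pth'  # placeholder path
args.rec_model_path = 'models/ch_PP-OCRv4_rec_infer.pth'  # placeholder path
text_sys = TextSystem(args)
dt_boxes, rec_res = text_sys(cv2.imread('page.jpg'))
for text, score in rec_res:
    print(text, score)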
@@ -23,25 +23,17 @@ class RapidTableModel(object):
         self.table_model = RapidTable(input_args)
 
-        # if ocr_engine is None:
-        #     self.ocr_model_name = "RapidOCR"
-        #     if torch.cuda.is_available():
-        #         from rapidocr_paddle import RapidOCR
-        #         self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
-        #     else:
-        #         from rapidocr_onnxruntime import RapidOCR
-        #         self.ocr_engine = RapidOCR()
-        # else:
-        #     self.ocr_model_name = "PaddleOCR"
-        #     self.ocr_engine = ocr_engine
-        self.ocr_model_name = "PaddleOCR"
-        self.ocr_engine = ocr_engine
+        # self.ocr_model_name = "RapidOCR"
+        # if torch.cuda.is_available():
+        #     from rapidocr_paddle import RapidOCR
+        #     self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
+        # else:
+        #     from rapidocr_onnxruntime import RapidOCR
+        #     self.ocr_engine = RapidOCR()
+        self.ocr_model_name = "RapidOCR"
+        if torch.cuda.is_available():
+            from rapidocr_paddle import RapidOCR
+            self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
+        else:
+            from rapidocr_onnxruntime import RapidOCR
+            self.ocr_engine = RapidOCR()
 
     def predict(self, image):
import torch
from struct_eqtable import build_model
from magic_pdf.model.sub_modules.table.table_utils import minify_html
class StructTableModel:
def __init__(self, model_path, max_new_tokens=1024, max_time=60):
# init
assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
self.model = build_model(
model_ckpt=model_path,
max_new_tokens=max_new_tokens,
max_time=max_time,
lmdeploy=False,
flash_attn=False,
batch_size=1,
).cuda()
self.default_format = "html"
def predict(self, images, output_format=None, **kwargs):
if output_format is None:
output_format = self.default_format
else:
if output_format not in ['latex', 'markdown', 'html']:
raise ValueError(f"Output format {output_format} is not supported.")
results = self.model(
images, output_format=output_format
)
if output_format == "html":
results = [minify_html(html) for html in results]
return results
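
For reference, a hedged usage sketch of this wrapper; the checkpoint path and image file are placeholders, and a CUDA device is required by the assert in __init__:

from PIL import Image

# Hypothetical call: parse one table image and emit minified HTML.
model = StructTableModel('/models/TabRec/StructEqTable', max_new_tokens=1024, max_time=60)
html_results = model.predict([Image.open('table.png')], output_format='html')
print(html_results[0])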
import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403
class TableMasterPaddleModel(object):
"""This class is responsible for converting image of table into HTML format
using a pre-trained model.
Attributes:
- table_sys: An instance of TableSystem initialized with parsed arguments.
Methods:
- __init__(config): Initializes the model with configuration parameters.
- img2html(image): Converts a PIL Image or NumPy array to HTML string.
- parse_args(**kwargs): Parses configuration arguments.
"""
def __init__(self, config):
"""
Parameters:
- config (dict): Configuration dictionary containing model_dir and device.
"""
args = self.parse_args(**config)
self.table_sys = TableSystem(args)
def img2html(self, image):
"""
Parameters:
- image (PIL.Image or np.ndarray): The image of the table to be converted.
Return:
- HTML (str): A string representing the HTML structure with content of the table.
"""
if isinstance(image, Image.Image):
image = np.asarray(image)
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
pred_res, _ = self.table_sys(image)
pred_html = pred_res['html']
# res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
# "</table></body></html>","") + "</table></td>\n"
return pred_html
def parse_args(self, **kwargs):
parser = init_args()
model_dir = kwargs.get('model_dir')
table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR) # noqa: F405
table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT) # noqa: F405
det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR) # noqa: F405
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) # noqa: F405
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) # noqa: F405
device = kwargs.get('device', 'cpu')
        use_gpu = device.startswith('cuda')
config = {
'use_gpu': use_gpu,
'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN), # noqa: F405
'table_algorithm': 'TableMaster',
'table_model_dir': table_model_dir,
'table_char_dict_path': table_char_dict_path,
'det_model_dir': det_model_dir,
'rec_model_dir': rec_model_dir,
'rec_char_dict_path': rec_char_dict_path,
}
parser.set_defaults(**config)
return parser.parse_args([])
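
A matching sketch for this wrapper; the model_dir value is a placeholder and must contain the TableMaster, detection, and recognition model subdirectories named by the path constants above:

from PIL import Image

# Hypothetical config; model_dir must hold the weights referenced by the constants above.
table_model = TableMasterPaddleModel({'model_dir': '/models', 'device': 'cpu'})
print(table_model.img2html(Image.open('table.png')))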
@@ -4,6 +4,7 @@ import os
 import re
 import statistics
 import time
+import warnings
 from typing import List
 
 import cv2
@@ -21,12 +22,9 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
-from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
-from concurrent.futures import ThreadPoolExecutor
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.post_proc.para_split_v3 import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
@@ -195,7 +193,7 @@ def calculate_contrast(img, img_mode) -> float:
     std_dev = np.std(gray_img)
     # contrast is defined as std dev divided by the mean (plus a small constant to avoid division by zero)
     contrast = std_dev / (mean_value + 1e-6)
-    # logger.info(f"contrast: {contrast}")
+    # logger.debug(f"contrast: {contrast}")
     return round(contrast, 2)
 
 # @measure_time
@@ -288,33 +286,39 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     if len(need_ocr_spans) > 0:
         # initialize the OCR model
-        atom_model_manager = AtomModelSingleton()
-        ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name='ocr',
-            ocr_show_log=False,
-            det_db_box_thresh=0.3,
-            lang=lang
-        )
+        # atom_model_manager = AtomModelSingleton()
+        # ocr_model = atom_model_manager.get_atom_model(
+        #     atom_model_name='ocr',
+        #     ocr_show_log=False,
+        #     det_db_box_thresh=0.3,
+        #     lang=lang
+        # )
 
         for span in need_ocr_spans:
             # crop the span's bbox and run OCR on it
             span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
 
             # compute the span's contrast; spans below 0.20 are not OCRed
-            if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
+            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                 spans.remove(span)
                 continue
+                # pass
 
+            span['content'] = ''
+            span['score'] = 1
+            span['np_img'] = span_img
 
-            ocr_res = ocr_model.ocr(span_img, det=False)
-            if ocr_res and len(ocr_res) > 0:
-                if len(ocr_res[0]) > 0:
-                    ocr_text, ocr_score = ocr_res[0][0]
-                    # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
-                    if ocr_score > 0.5 and len(ocr_text) > 0:
-                        span['content'] = ocr_text
-                        span['score'] = ocr_score
-                    else:
-                        spans.remove(span)
+            # ocr_res = ocr_model.ocr(span_img, det=False)
+            # if ocr_res and len(ocr_res) > 0:
+            #     if len(ocr_res[0]) > 0:
+            #         ocr_text, ocr_score = ocr_res[0][0]
+            #         # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
+            #         if ocr_score > 0.5 and len(ocr_text) > 0:
+            #             span['content'] = ocr_text
+            #             span['score'] = float(round(ocr_score, 2))
+            #         else:
+            #             spans.remove(span)
 
     return spans
@@ -372,9 +376,12 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
     from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
         boxes2inputs, parse_logits, prepare_inputs)
 
-    inputs = boxes2inputs(boxes)
-    inputs = prepare_inputs(inputs, model)
-    logits = model(**inputs).logits.cpu().squeeze(0)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
+
+        inputs = boxes2inputs(boxes)
+        inputs = prepare_inputs(inputs, model)
+        logits = model(**inputs).logits.cpu().squeeze(0)
     return parse_logits(logits, len(boxes))
@@ -951,7 +958,47 @@ def pdf_parse_union(
         )
         pdf_info_dict[f'page_{page_id}'] = page_info
 
-    # PerformanceStats.print_stats()
+    need_ocr_list = []
+    img_crop_list = []
+    text_block_list = []
+    for pange_id, page_info in pdf_info_dict.items():
+        for block in page_info['preproc_blocks']:
+            if block['type'] in ['table', 'image']:
+                for sub_block in block['blocks']:
+                    if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
+                        text_block_list.append(sub_block)
+            elif block['type'] in ['text', 'title']:
+                text_block_list.append(block)
+        for block in page_info['discarded_blocks']:
+            text_block_list.append(block)
+
+    for block in text_block_list:
+        for line in block['lines']:
+            for span in line['spans']:
+                if 'np_img' in span:
+                    need_ocr_list.append(span)
+                    img_crop_list.append(span['np_img'])
+                    span.pop('np_img')
+
+    if len(img_crop_list) > 0:
+        # Get OCR results for this language's images
+        atom_model_manager = AtomModelSingleton()
+        ocr_model = atom_model_manager.get_atom_model(
+            atom_model_name='ocr',
+            ocr_show_log=False,
+            det_db_box_thresh=0.3,
+            lang=lang
+        )
+
+        rec_start = time.time()
+        ocr_res_list = ocr_model.ocr(img_crop_list, det=False)[0]
+
+        # Verify we have matching counts
+        assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
+
+        # Process OCR results for this language
+        for index, span in enumerate(need_ocr_list):
+            ocr_text, ocr_score = ocr_res_list[index]
+            span['content'] = ocr_text
+            span['score'] = float(round(ocr_score, 2))
+
+        rec_time = time.time() - rec_start
+        logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
 
     """Paragraph segmentation"""
     para_split(pdf_info_dict)
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
@@ -3,12 +3,12 @@ Brotli>=1.1.0
 click>=8.1.7
 fast-langdetect>=0.2.3,<0.3.0
 loguru>=0.6.0
-numpy>=1.21.6,<2.0.0
-pydantic>=2.7.2
-PyMuPDF>=1.24.9,<=1.24.14
+numpy>=1.21.6
+pydantic>=2.7.2,<2.11
+PyMuPDF>=1.24.9,<1.25.0
 scikit-learn>=1.0.2
 torch>=2.2.2,!=2.5.0,!=2.5.1,<=2.6.0
 torchvision
-transformers>=4.49.0
+transformers>=4.49.0,<5.0.0
 pdfminer.six==20231228
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
@@ -37,7 +37,7 @@ if __name__ == '__main__':
         "models/Layout/YOLO/*",
         "models/MFD/YOLO/*",
         "models/MFR/unimernet_hf_small_2503/*",
-        "models/OCR/paddleocr/*",
+        "models/OCR/paddleocr_torch/*",
         # "models/TabRec/TableMaster/*",
         # "models/TabRec/StructEqTable/*",
     ]
@@ -47,11 +47,11 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    paddleocr_model_dir = model_dir + '/OCR/paddleocr'
-    user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
-    if os.path.exists(user_paddleocr_dir):
-        shutil.rmtree(user_paddleocr_dir)
-    shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
+    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
+    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
+    # if os.path.exists(user_paddleocr_dir):
+    #     shutil.rmtree(user_paddleocr_dir)
+    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
 
     json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
@@ -38,7 +38,7 @@ if __name__ == '__main__':
         "models/Layout/YOLO/*",
         "models/MFD/YOLO/*",
         "models/MFR/unimernet_hf_small_2503/*",
-        "models/OCR/paddleocr/*",
+        "models/OCR/paddleocr_torch/*",
         # "models/TabRec/TableMaster/*",
         # "models/TabRec/StructEqTable/*",
     ]
@@ -54,11 +54,11 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    paddleocr_model_dir = model_dir + '/OCR/paddleocr'
-    user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
-    if os.path.exists(user_paddleocr_dir):
-        shutil.rmtree(user_paddleocr_dir)
-    shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
+    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
+    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
+    # if os.path.exists(user_paddleocr_dir):
+    #     shutil.rmtree(user_paddleocr_dir)
+    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
 
     json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
@@ -26,9 +26,10 @@ if __name__ == '__main__':
     setup(
         name="magic_pdf",  # project name
         version=__version__,  # version number taken automatically from the git tag
-        packages=find_packages() + ["magic_pdf.resources"],  # include all packages
+        packages=find_packages() + ["magic_pdf.resources"] + ["magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],  # include all packages
         package_data={
             "magic_pdf.resources": ["**"],  # include all files under magic_pdf.resources
+            "magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # all files under the pytorchocr resources directory
         },
         install_requires=parse_requirements('requirements.txt'),  # third-party dependencies of the project
         extras_require={
@@ -38,30 +39,21 @@ if __name__ == '__main__':
         ],
         "full": [
             "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1+ ships no prebuilt Windows wheels; pinned so Windows machines without a build toolchain can still install
-            "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # do not cap matplotlib on Linux/macOS, to avoid bugs caused by being unable to update
+            "matplotlib>=3.10;platform_system=='Linux' or platform_system=='Darwin'",  # do not cap matplotlib on Linux/macOS, to avoid bugs caused by being unable to update
             "ultralytics>=8.3.48",  # YOLOv8, formula detection
-            "paddleocr==2.7.3",  # 2.8.0 and 2.8.1 conflict with detectron2; pin 2.7.3
-            "paddlepaddle==3.0.0rc1;platform_system=='Linux' or platform_system=='Darwin'",  # fixes the segfault on Linux
-            "paddlepaddle==2.6.1;platform_system=='Windows'",  # 3.0.0 is slower on Windows; pin 2.6.1
             "doclayout_yolo==0.0.2b1",  # doclayout_yolo
-            "rapidocr-paddle>=1.4.5,<2.0.0",  # rapidocr-paddle
-            "rapidocr_onnxruntime>=1.4.4,<2.0.0",
+            "dill>=0.3.9,<1",  # doclayout_yolo
             "rapid_table>=1.0.3,<2.0.0",  # rapid_table
-            "PyYAML",  # yaml
-            "ftfy"
-            "openai",  # openai SDK
+            "PyYAML>=6.0.2,<7",  # yaml
+            "ftfy>=6.3.1,<7",  # unimernet_hf
+            "openai>=1.70.0,<2",  # openai SDK
+            "shapely>=2.0.7,<3",  # imgaug-paddleocr2pytorch
+            "pyclipper>=1.3.0,<2",  # paddleocr2pytorch
+            "omegaconf>=2.3.0,<3",  # paddleocr2pytorch
         ],
         "old_linux":[
             "albumentations<=1.4.20",  # simsimd, introduced in 1.4.21, does not support Linux systems from 2019 or earlier
         ],
-        "layoutlmv3":[
-            "detectron2"
-        ],
-        "struct_eqtable":[
-            "struct-eqtable==0.3.2",  # table parsing
-            "einops",  # struct-eqtable dependency
-            "accelerate",  # struct-eqtable dependency
-        ],
         },
         description="A practical tool for converting PDF to Markdown",  # short description
         long_description=long_description,  # long description