Unverified Commit 41d96cd8 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2065 from opendatalab/release-1.3.0

Release 1.3.0
parents c3d43e52 dd96663c
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Donut Swin Transformer model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class UnimerSwinConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a
UnimerSwin (Donut Swin) model according to the specified arguments, defining the model architecture. Instantiating
a configuration with the defaults will yield a configuration similar to that of the Donut
[naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 96):
Dimensionality of patch embedding.
depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
Depth of each layer in the Transformer encoder.
num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
Number of attention heads in each layer of the Transformer encoder.
window_size (`int`, *optional*, defaults to 7):
Size of windows.
mlp_ratio (`float`, *optional*, defaults to 4.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to add absolute position embeddings to the patch embeddings.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import UnimerSwinConfig, UnimerSwinModel
>>> # Initializing a Donut naver-clova-ix/donut-base style configuration
>>> configuration = UnimerSwinConfig()
>>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
>>> model = UnimerSwinModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "unimer-swin"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
image_size=224,
patch_size=4,
num_channels=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
use_absolute_embeddings=False,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.window_size = window_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
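A quick sanity-check sketch of the derived attributes above (the numbers follow directly from the defaults `embed_dim=96`, `depths=[2, 2, 6, 2]`):
```python
config = UnimerSwinConfig()

# num_layers is len(depths); hidden_size doubles the channel dimension once
# per downsampling stage: 96 -> 192 -> 384 -> 768.
assert config.num_layers == 4
assert config.hidden_size == 96 * 2 ** (4 - 1)  # 768

# attribute_map lets generic Transformers code use the standard names:
assert config.num_attention_heads == config.num_heads  # [3, 6, 12, 24]
```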
from transformers.image_processing_utils import BaseImageProcessor
import numpy as np
import cv2
import albumentations as alb
from albumentations.pytorch import ToTensorV2
# TODO: dereference cv2 if possible
class UnimerSwinImageProcessor(BaseImageProcessor):
def __init__(
self,
image_size = (192, 672),
):
self.input_size = [int(_) for _ in image_size]
assert len(self.input_size) == 2
self.transform = alb.Compose(
[
alb.ToGray(),
alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
# alb.Sharpen()
ToTensorV2(),
]
)
def __call__(self, item):
image = self.prepare_input(item)
# ToGray produces three identical channels; keep only the first one
return self.transform(image=image)['image'][:1]
@staticmethod
def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
"""Crop margins of image using NumPy operations"""
# Convert to grayscale if it's a color image
if len(img.shape) == 3 and img.shape[2] == 3:
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
else:
gray = img.copy()
# Normalize and threshold
if gray.max() == gray.min():
return img
normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8)
binary = 255 * (normalized < 200).astype(np.uint8)
# Find bounding box
coords = cv2.findNonZero(binary) # Find all non-zero points (text)
x, y, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
# Return cropped image
return img[y:y + h, x:x + w]
def prepare_input(self, img, random_padding: bool = False):
"""
Convert PIL Image or numpy array to properly sized and padded image after:
- crop margins
- resize while maintaining aspect ratio
- pad to target size
"""
if img is None:
return None
# try:
# img = self.crop_margin_numpy(img)
# except Exception:
# # might throw an error for broken files
# return None
if img.shape[0] == 0 or img.shape[1] == 0:
return None
# Get current dimensions
h, w = img.shape[:2]
target_h, target_w = self.input_size
# Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail)
scale = min(target_h / h, target_w / w)
# Calculate new dimensions
new_h, new_w = int(h * scale), int(w * scale)
# Resize the image while preserving aspect ratio
resized_img = cv2.resize(img, (new_w, new_h))
# Calculate padding values using the existing method
delta_width = target_w - new_w
delta_height = target_h - new_h
pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
# Apply padding (convert PIL padding format to OpenCV format)
padding_color = [0, 0, 0] if len(img.shape) == 3 else [0]
padded_img = cv2.copyMakeBorder(
resized_img,
pad_height, # top
delta_height - pad_height, # bottom
pad_width, # left
delta_width - pad_width, # right
cv2.BORDER_CONSTANT,
value=padding_color
)
return padded_img
def _calculate_padding(self, new_w, new_h, random_padding):
"""Calculate padding values for PIL images"""
delta_width = self.input_size[1] - new_w
delta_height = self.input_size[0] - new_h
pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
return (
pad_width,
pad_height,
delta_width - pad_width,
delta_height - pad_height,
)
def _get_padding_values(self, new_w, new_h, random_padding):
"""Get padding values based on image dimensions and padding strategy"""
delta_width = self.input_size[1] - new_w
delta_height = self.input_size[0] - new_h
if random_padding:
pad_width = np.random.randint(low=0, high=delta_width + 1)
pad_height = np.random.randint(low=0, high=delta_height + 1)
else:
pad_width = delta_width // 2
pad_height = delta_height // 2
return pad_width, pad_height
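A usage sketch for the processor above, assuming `albumentations`, `opencv-python`, and `torch` are installed; the input must be a numpy array (HxWx3), since `prepare_input` reads `img.shape`:
```python
import numpy as np

processor = UnimerSwinImageProcessor()  # target size defaults to (192, 672)
page_crop = np.full((100, 400, 3), 255, dtype=np.uint8)  # dummy white crop

tensor = processor(page_crop)
# ToGray yields three identical channels and __call__ keeps only the first,
# so the output is a single-channel tensor at the padded target size.
print(tensor.shape)  # torch.Size([1, 192, 672])
```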
@@ -5,47 +5,57 @@ from magic_pdf.config.constants import MODEL_NAME
from magic_pdf.model.model_list import AtomicModel
from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
-try:
-from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
-from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
-from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
-license_key = load_license()
-logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
-f' License expired at {license_key["payload"]["date"]["end_date"]}')
-except Exception as e:
-if isinstance(e, ImportError):
-pass
-elif isinstance(e, LicenseFormatError):
-logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
-elif isinstance(e, LicenseSignatureError):
-logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
-elif isinstance(e, LicenseExpiredError):
-logger.error("Ascend Plugin: License has expired. Please renew your license.")
-elif isinstance(e, FileNotFoundError):
-logger.error("Ascend Plugin: Not found License file.")
-else:
-logger.error(f"Ascend Plugin: {e}")
-from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
-# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
-from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
-from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
-def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
+from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
+from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
+# try:
+#     from magic_pdf_ascend_plugin.libs.license_verifier import (
+#         LicenseExpiredError, LicenseFormatError, LicenseSignatureError,
+#         load_license)
+#     from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
+#     from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
+#     license_key = load_license()
+#     logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
+#                 f' License expired at {license_key["payload"]["date"]["end_date"]}')
+# except Exception as e:
+#     if isinstance(e, ImportError):
+#         pass
+#     elif isinstance(e, LicenseFormatError):
+#         logger.error('Ascend Plugin: Invalid license format. Please check the license file.')
+#     elif isinstance(e, LicenseSignatureError):
+#         logger.error('Ascend Plugin: Invalid signature. The license may be tampered with.')
+#     elif isinstance(e, LicenseExpiredError):
+#         logger.error('Ascend Plugin: License has expired. Please renew your license.')
+#     elif isinstance(e, FileNotFoundError):
+#         logger.error('Ascend Plugin: Not found License file.')
+#     else:
+#         logger.error(f'Ascend Plugin: {e}')
+#     from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
+#     # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
+#     from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
+def table_model_init(table_model_type, model_path, max_time, _device_='cpu', lang=None, table_sub_model_name=None):
if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
+from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
table_model = StructTableModel(model_path, max_new_tokens=2048, max_time=max_time)
elif table_model_type == MODEL_NAME.TABLE_MASTER:
+from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
config = {
'model_dir': model_path,
'device': _device_
}
table_model = TableMasterPaddleModel(config)
elif table_model_type == MODEL_NAME.RAPID_TABLE:
+atom_model_manager = AtomModelSingleton()
+ocr_engine = atom_model_manager.get_atom_model(
+atom_model_name='ocr',
+ocr_show_log=False,
+det_db_box_thresh=0.5,
+det_db_unclip_ratio=1.6,
+lang=lang
+)
table_model = RapidTableModel(ocr_engine, table_sub_model_name)
else:
logger.error('table model type not allow')
@@ -55,7 +65,7 @@ def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr
def mfd_model_init(weight, device='cpu'):
if str(device).startswith("npu"):
if str(device).startswith('npu'):
device = torch.device(device)
mfd_model = YOLOv8MFDModel(weight, device)
return mfd_model
@@ -67,19 +77,20 @@ def mfr_model_init(weight_dir, cfg_path, device='cpu'):
def layout_model_init(weight, config_file, device):
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
model = Layoutlmv3_Predictor(weight, config_file, device)
return model
def doclayout_yolo_model_init(weight, device='cpu'):
if str(device).startswith("npu"):
if str(device).startswith('npu'):
device = torch.device(device)
model = DocLayoutYOLOModel(weight, device)
return model
def langdetect_model_init(langdetect_model_weight, device='cpu'):
if str(device).startswith("npu"):
if str(device).startswith('npu'):
device = torch.device(device)
model = YOLOv11LangDetModel(langdetect_model_weight, device)
return model
@@ -92,7 +103,8 @@ def ocr_model_init(show_log: bool = False,
det_db_unclip_ratio=1.8,
):
if lang is not None and lang != '':
-model = ModifiedPaddleOCR(
+# model = ModifiedPaddleOCR(
+model = PytorchPaddleOCR(
show_log=show_log,
det_db_box_thresh=det_db_box_thresh,
lang=lang,
@@ -100,7 +112,8 @@ def ocr_model_init(show_log: bool = False,
det_db_unclip_ratio=det_db_unclip_ratio,
)
else:
-model = ModifiedPaddleOCR(
+# model = ModifiedPaddleOCR(
+model = PytorchPaddleOCR(
show_log=show_log,
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
@@ -129,7 +142,7 @@ class AtomModelSingleton:
elif atom_model_name in [AtomicModel.Layout]:
key = (atom_model_name, layout_model_name)
elif atom_model_name in [AtomicModel.Table]:
-key = (atom_model_name, table_model_name)
+key = (atom_model_name, table_model_name, lang)
else:
key = atom_model_name
@@ -177,7 +190,7 @@ def atom_model_init(model_name: str, **kwargs):
kwargs.get('table_model_path'),
kwargs.get('table_max_time'),
kwargs.get('device'),
-kwargs.get('ocr_engine'),
+kwargs.get('lang'),
kwargs.get('table_sub_model_name')
)
elif model_name == AtomicModel.LangDetect:
......
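A hedged sketch of the revised call path after this change: callers now pass `lang` (threaded into a shared OCR engine via `AtomModelSingleton`) instead of a prebuilt `ocr_engine`. The weights path and sub-model name below are illustrative only:
```python
from magic_pdf.config.constants import MODEL_NAME

table_model = table_model_init(
    MODEL_NAME.RAPID_TABLE,
    model_path='/models/rapid_table',    # hypothetical weights location
    max_time=400,
    _device_='cpu',
    lang='ch',                           # forwarded to the shared OCR engine
    table_sub_model_name='slanet_plus',  # illustrative sub-model name
)
```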
import time
import torch
from PIL import Image
from loguru import logger
import numpy as np
from magic_pdf.libs.clean_memory import clean_memory
-def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
-# Create a white background with an additional width and height of 50
+# Calculate new dimensions
crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
-return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
-# Crop image
-crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
-cropped_img = input_pil_img.crop(crop_box)
-return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
-return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+# Create a white background array
+return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
+# Crop the original image using numpy slicing
+cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
+# Paste the cropped image onto the white background
+return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
+crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
+return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
+crop_new_height]
return return_image, return_list
......
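A quick sketch of the rewritten numpy version: `poly` holds corner coordinates with x0/y0 at indices 0/1 and x2/y2 at indices 4/5, and the crop is pasted onto a white canvas enlarged by the paste offsets:
```python
import numpy as np

page = np.zeros((800, 600, 3), dtype=np.uint8)            # dummy page image
res = {'poly': [100, 120, 300, 120, 300, 260, 100, 260]}  # 200x140 region

cropped, meta = crop_img(res, page, crop_paste_x=50, crop_paste_y=50)
print(cropped.shape)  # (140 + 2*50, 200 + 2*50, 3) == (240, 300, 3)
print(meta)           # [50, 50, 100, 120, 300, 260, 300, 240]
```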
import copy
import platform
import time
import cv2
import numpy as np
import torch
from paddleocr import PaddleOCR
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import alpha_to_color, binarize_img
from tools.infer.predict_system import sorted_boxes
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img, \
ONNXModelSingleton
logger = get_logger()
class ModifiedPaddleOCR(PaddleOCR):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.lang = kwargs.get('lang', 'ch')
# Fall back to ONNX when the CPU architecture is ARM and CUDA is unavailable
if not torch.cuda.is_available() and platform.machine() in ['arm64', 'aarch64']:
self.use_onnx = True
onnx_model_manager = ONNXModelSingleton()
self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs)
else:
self.use_onnx = False
def ocr(self,
img,
det=True,
rec=True,
cls=True,
bin=False,
inv=False,
alpha_color=(255, 255, 255),
mfd_res=None,
):
"""
OCR with PaddleOCR
args:
img: image for OCR; supports ndarray, img_path, and list of ndarray
det: use text detection or not. If False, only recognition will be executed. Default is True
rec: use text recognition or not. If False, only detection will be executed. Default is True
cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
bin: binarize image to black and white. Default is False.
inv: invert image colors. Default is False.
alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
"""
assert isinstance(img, (np.ndarray, list, str, bytes))
if isinstance(img, list) and det == True:
logger.error('When the input is a list of images, det must be False')
exit(0)
if cls == True and self.use_angle_cls == False:
pass
# logger.warning(
# 'Since the angle classifier is not initialized, it will not be used during the forward process'
# )
img = check_img(img)
# for infer pdf file
if isinstance(img, list):
if self.page_num > len(img) or self.page_num == 0:
self.page_num = len(img)
imgs = img[:self.page_num]
else:
imgs = [img]
def preprocess_image(_image):
_image = alpha_to_color(_image, alpha_color)
if inv:
_image = cv2.bitwise_not(_image)
if bin:
_image = binarize_img(_image)
return _image
if det and rec:
ocr_res = []
for img in imgs:
img = preprocess_image(img)
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
if not dt_boxes and not rec_res:
ocr_res.append(None)
continue
tmp_res = [[box.tolist(), res]
for box, res in zip(dt_boxes, rec_res)]
ocr_res.append(tmp_res)
return ocr_res
elif det and not rec:
ocr_res = []
for img in imgs:
img = preprocess_image(img)
if self.lang in ['ch'] and self.use_onnx:
dt_boxes, elapse = self.additional_ocr.text_detector(img)
else:
dt_boxes, elapse = self.text_detector(img)
if dt_boxes is None:
ocr_res.append(None)
continue
dt_boxes = sorted_boxes(dt_boxes)
# merge_det_boxes and update_det_boxes both convert polys to bboxes and back, so heavily tilted text boxes must be filtered out first
dt_boxes = merge_det_boxes(dt_boxes)
if mfd_res:
bef = time.time()
dt_boxes = update_det_boxes(dt_boxes, mfd_res)
aft = time.time()
logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), aft - bef))
tmp_res = [box.tolist() for box in dt_boxes]
ocr_res.append(tmp_res)
return ocr_res
else:
ocr_res = []
cls_res = []
for img in imgs:
if not isinstance(img, list):
img = preprocess_image(img)
img = [img]
if self.use_angle_cls and cls:
img, cls_res_tmp, elapse = self.text_classifier(img)
if not rec:
cls_res.append(cls_res_tmp)
if self.lang in ['ch'] and self.use_onnx:
rec_res, elapse = self.additional_ocr.text_recognizer(img)
else:
rec_res, elapse = self.text_recognizer(img)
ocr_res.append(rec_res)
if not rec:
return cls_res
return ocr_res
def __call__(self, img, cls=True, mfd_res=None):
time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
if img is None:
logger.debug("no valid image provided")
return None, None, time_dict
start = time.time()
ori_im = img.copy()
if self.lang in ['ch'] and self.use_onnx:
dt_boxes, elapse = self.additional_ocr.text_detector(img)
else:
dt_boxes, elapse = self.text_detector(img)
time_dict['det'] = elapse
if dt_boxes is None:
logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
end = time.time()
time_dict['all'] = end - start
return None, None, time_dict
else:
logger.debug("dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), elapse))
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
# merge_det_boxes and update_det_boxes both convert polys to bboxes and back, so heavily tilted text boxes must be filtered out first
dt_boxes = merge_det_boxes(dt_boxes)
if mfd_res:
bef = time.time()
dt_boxes = update_det_boxes(dt_boxes, mfd_res)
aft = time.time()
logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), aft - bef))
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
if self.args.det_box_type == "quad":
img_crop = get_rotate_crop_image(ori_im, tmp_box)
else:
img_crop = get_minarea_rect_crop(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls and cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
time_dict['cls'] = elapse
logger.debug("cls num : {}, elapsed : {}".format(
len(img_crop_list), elapse))
if self.lang in ['ch'] and self.use_onnx:
rec_res, elapse = self.additional_ocr.text_recognizer(img_crop_list)
else:
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict['rec'] = elapse
logger.debug("rec_res num : {}, elapsed : {}".format(
len(rec_res), elapse))
if self.args.save_crop_res:
self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
rec_res)
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict['all'] = end - start
return filter_boxes, filter_rec_res, time_dict
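A hedged usage sketch for the class above, assuming paddleocr 2.7.x and its default detection/recognition models are available; `mfd_res` is the optional formula-detection result used to split text boxes that overlap formulas:
```python
import numpy as np

ocr = ModifiedPaddleOCR(use_angle_cls=False, lang='ch')
page = np.full((960, 720, 3), 255, dtype=np.uint8)  # dummy white page

result = ocr.ocr(page, mfd_res=None)  # one entry per input image
for line in result[0] or []:          # the entry is None when nothing is found
    box, (text, score) = line
    print(box, text, score)
```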
import copy
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from paddleocr.paddleocr import check_img, logger
from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
from paddleocr.tools.infer.predict_system import sorted_boxes
from paddleocr.tools.infer.utility import slice_generator, merge_fragmented, get_rotate_crop_image, \
get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes
class ModifiedPaddleOCR(PaddleOCR):
def ocr(
self,
img,
det=True,
rec=True,
cls=True,
bin=False,
inv=False,
alpha_color=(255, 255, 255),
slice={},
mfd_res=None,
):
"""
OCR with PaddleOCR
Args:
img: Image for OCR. It can be an ndarray, img_path, or a list of ndarrays.
det: Use text detection or not. If False, only text recognition will be executed. Default is True.
rec: Use text recognition or not. If False, only text detection will be executed. Default is True.
cls: Use angle classifier or not. Default is True. If True, the text with a rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance.
bin: Binarize image to black and white. Default is False.
inv: Invert image colors. Default is False.
alpha_color: Set RGB color Tuple for transparent parts replacement. Default is pure white.
slice: Use sliding window inference for large images. Both det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres"] (See doc/doc_en/slice_en.md). Default is {}.
Returns:
If both det and rec are True, returns a list of OCR results for each image. Each OCR result is a list of bounding boxes and recognized text for each detected text region.
If det is True and rec is False, returns a list of detected bounding boxes for each image.
If det is False and rec is True, returns a list of recognized text for each image.
If both det and rec are False, returns a list of angle classification results for each image.
Raises:
AssertionError: If the input image is not of type ndarray, list, str, or bytes.
SystemExit: If det is True and the input is a list of images.
Note:
- If the angle classifier is not initialized (use_angle_cls=False), it will not be used during the forward process.
- For PDF files, if the input is a list of images and the page_num is specified, only the first page_num images will be processed.
- The preprocess_image function is used to preprocess the input image by applying alpha color replacement, inversion, and binarization if specified.
"""
assert isinstance(img, (np.ndarray, list, str, bytes))
if isinstance(img, list) and det == True:
logger.error("When input a list of images, det must be false")
exit(0)
if cls == True and self.use_angle_cls == False:
logger.warning(
"Since the angle classifier is not initialized, it will not be used during the forward process"
)
img, flag_gif, flag_pdf = check_img(img, alpha_color)
# for infer pdf file
if isinstance(img, list) and flag_pdf:
if self.page_num > len(img) or self.page_num == 0:
imgs = img
else:
imgs = img[: self.page_num]
else:
imgs = [img]
def preprocess_image(_image):
_image = alpha_to_color(_image, alpha_color)
if inv:
_image = cv2.bitwise_not(_image)
if bin:
_image = binarize_img(_image)
return _image
if det and rec:
ocr_res = []
for img in imgs:
img = preprocess_image(img)
dt_boxes, rec_res, _ = self.__call__(img, cls, slice, mfd_res=mfd_res)
if not dt_boxes and not rec_res:
ocr_res.append(None)
continue
tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
ocr_res.append(tmp_res)
return ocr_res
elif det and not rec:
ocr_res = []
for img in imgs:
img = preprocess_image(img)
dt_boxes, elapse = self.text_detector(img)
if dt_boxes.size == 0:
ocr_res.append(None)
continue
tmp_res = [box.tolist() for box in dt_boxes]
ocr_res.append(tmp_res)
return ocr_res
else:
ocr_res = []
cls_res = []
for img in imgs:
if not isinstance(img, list):
img = preprocess_image(img)
img = [img]
if self.use_angle_cls and cls:
img, cls_res_tmp, elapse = self.text_classifier(img)
if not rec:
cls_res.append(cls_res_tmp)
rec_res, elapse = self.text_recognizer(img)
ocr_res.append(rec_res)
if not rec:
return cls_res
return ocr_res
def __call__(self, img, cls=True, slice={}, mfd_res=None):
time_dict = {"det": 0, "rec": 0, "cls": 0, "all": 0}
if img is None:
logger.debug("no valid image provided")
return None, None, time_dict
start = time.time()
ori_im = img.copy()
if slice:
slice_gen = slice_generator(
img,
horizontal_stride=slice["horizontal_stride"],
vertical_stride=slice["vertical_stride"],
)
elapsed = []
dt_slice_boxes = []
for slice_crop, v_start, h_start in slice_gen:
dt_boxes, elapse = self.text_detector(slice_crop, use_slice=True)
if dt_boxes.size:
dt_boxes[:, :, 0] += h_start
dt_boxes[:, :, 1] += v_start
dt_slice_boxes.append(dt_boxes)
elapsed.append(elapse)
dt_boxes = np.concatenate(dt_slice_boxes)
dt_boxes = merge_fragmented(
boxes=dt_boxes,
x_threshold=slice["merge_x_thres"],
y_threshold=slice["merge_y_thres"],
)
elapse = sum(elapsed)
else:
dt_boxes, elapse = self.text_detector(img)
time_dict["det"] = elapse
if dt_boxes is None:
logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
end = time.time()
time_dict["all"] = end - start
return None, None, time_dict
else:
logger.debug(
"dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse)
)
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
if mfd_res:
bef = time.time()
dt_boxes = update_det_boxes(dt_boxes, mfd_res)
aft = time.time()
logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
len(dt_boxes), aft - bef))
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
if self.args.det_box_type == "quad":
img_crop = get_rotate_crop_image(ori_im, tmp_box)
else:
img_crop = get_minarea_rect_crop(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls and cls:
img_crop_list, angle_list, elapse = self.text_classifier(img_crop_list)
time_dict["cls"] = elapse
logger.debug(
"cls num : {}, elapsed : {}".format(len(img_crop_list), elapse)
)
if len(img_crop_list) > 1000:
logger.debug(
f"rec crops num: {len(img_crop_list)}, time and memory cost may be large."
)
rec_res, elapse = self.text_recognizer(img_crop_list)
time_dict["rec"] = elapse
logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse))
if self.args.save_crop_res:
self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res)
filter_boxes, filter_rec_res = [], []
for box, rec_result in zip(dt_boxes, rec_res):
text, score = rec_result[0], rec_result[1]
if score >= self.drop_score:
filter_boxes.append(box)
filter_rec_res.append(rec_result)
end = time.time()
time_dict["all"] = end - start
return filter_boxes, filter_rec_res, time_dict
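The 2.9.x variant above adds a sliding-window path for very large pages; a hedged sketch, with stride and merge thresholds that are illustrative only (see doc/doc_en/slice_en.md):
```python
import numpy as np

ocr = ModifiedPaddleOCR(use_angle_cls=False, lang='ch')
large_page = np.full((4000, 2000, 3), 255, dtype=np.uint8)

# det and rec must both be True when slicing; boxes from each window are
# shifted back into page coordinates and merged with the given thresholds.
result = ocr.ocr(
    large_page,
    slice={'horizontal_stride': 300, 'vertical_stride': 500,
           'merge_x_thres': 50, 'merge_y_thres': 35},
)
```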
# Copyright (c) Opendatalab. All rights reserved.
import os
import torch
from .modeling.architectures.base_model import BaseModel
class BaseOCRV20:
def __init__(self, config, **kwargs):
self.config = config
self.build_net(**kwargs)
self.net.eval()
def build_net(self, **kwargs):
self.net = BaseModel(self.config, **kwargs)
def read_pytorch_weights(self, weights_path):
if not os.path.exists(weights_path):
raise FileNotFoundError('{} does not exist.'.format(weights_path))
weights = torch.load(weights_path)
return weights
def get_out_channels(self, weights):
if list(weights.keys())[-1].endswith('.weight') and len(list(weights.values())[-1].shape) == 2:
out_channels = list(weights.values())[-1].numpy().shape[1]
else:
out_channels = list(weights.values())[-1].numpy().shape[0]
return out_channels
def load_state_dict(self, weights):
self.net.load_state_dict(weights)
# print('weights is loaded.')
def load_pytorch_weights(self, weights_path):
self.net.load_state_dict(torch.load(weights_path, weights_only=True))
# print('model is loaded: {}'.format(weights_path))
def inference(self, inputs):
with torch.no_grad():
infer = self.net(inputs)
return infer
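The out-channel rule in `get_out_channels` is easy to misread; a standalone sketch of the same logic, assuming (as the `shape[1]` branch implies) that converted linear weights are stored with the class dimension last:
```python
from collections import OrderedDict
import torch

def infer_out_channels(weights):
    last_key = list(weights.keys())[-1]
    last_val = list(weights.values())[-1]
    # A 2-D '.weight' last means a linear head: take its second dimension.
    if last_key.endswith('.weight') and len(last_val.shape) == 2:
        return last_val.numpy().shape[1]
    return last_val.numpy().shape[0]

weights = OrderedDict({'head.fc.weight': torch.zeros(64, 6625)})  # hypothetical
print(infer_out_channels(weights))  # 6625
```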
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from .imaug import transform, create_operators
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
# from .iaa_augment import IaaAugment
# from .make_border_map import MakeBorderMap
# from .make_shrink_map import MakeShrinkMap
# from .random_crop_data import EastRandomCropData, PSERandomCrop
# from .rec_img_aug import RecAug, RecResizeImg, ClsResizeImg
# from .randaugment import RandAugment
from .operators import *
# from .label_ops import *
# from .east_process import *
# from .sast_process import *
# from .gen_table_mask import *
def transform(data, ops=None):
""" transform """
if ops is None:
ops = []
for op in ops:
data = op(data)
if data is None:
return None
return data
def create_operators(op_param_list, global_config=None):
"""
create operators based on the config
Args:
params(list): a dict list, used to create some operators
"""
assert isinstance(op_param_list, list), ('operator config should be a list')
ops = []
for operator in op_param_list:
assert isinstance(operator,
dict) and len(operator) == 1, "yaml format error"
op_name = list(operator)[0]
param = {} if operator[op_name] is None else operator[op_name]
if global_config is not None:
param.update(global_config)
op = eval(op_name)(**param)
ops.append(op)
return ops
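A hedged sketch of the config-driven pipeline these helpers enable; the operator names assume the usual PaddleOCR exports from `.operators` (e.g. `DecodeImage`, `KeepKeys`), and `page.png` is a placeholder:
```python
op_config = [
    {'DecodeImage': {'img_mode': 'BGR', 'channel_first': False}},
    {'KeepKeys': {'keep_keys': ['image']}},
]
ops = create_operators(op_config)

with open('page.png', 'rb') as f:
    data = transform({'image': f.read()}, ops)  # -> [decoded ndarray], or None
```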
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
__all__ = ["build_model"]
def build_model(config, **kwargs):
from .base_model import BaseModel
config = copy.deepcopy(config)
module_class = BaseModel(config, **kwargs)
return module_class
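A hedged sketch of `build_model`; the schema mirrors PaddleOCR architecture configs, and every name below (backbone, neck, head, and their kwargs) is illustrative rather than guaranteed by this package's `.modeling` registry:
```python
arch_config = {
    'model_type': 'det',
    'algorithm': 'DB',
    'Transform': None,
    'Backbone': {'name': 'MobileNetV3', 'scale': 0.5, 'model_name': 'large'},
    'Neck': {'name': 'DBFPN', 'out_channels': 96},
    'Head': {'name': 'DBHead', 'k': 50},
}
model = build_model(arch_config)  # deep-copies the config, then BaseModel(...)
```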