Unverified commit 1ec5d09d, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1955 from myhloli/dev

Dev push
parents dd377537 6e35e382
from .configuration_unimer_swin import UnimerSwinConfig
from .modeling_unimer_swin import UnimerSwinModel
from .image_processing_unimer_swin import UnimerSwinImageProcessor
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
]
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Donut Swin Transformer model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class UnimerSwinConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a
Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Donut
[naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 96):
Dimensionality of patch embedding.
depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
Depth of each layer in the Transformer encoder.
num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
Number of attention heads in each layer of the Transformer encoder.
window_size (`int`, *optional*, defaults to 7):
Size of windows.
mlp_ratio (`float`, *optional*, defaults to 4.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
Whether or not to add absolute position embeddings to the patch embeddings.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import UnimerSwinConfig, UnimerSwinModel
>>> # Initializing a Donut naver-clova-ix/donut-base style configuration
>>> configuration = UnimerSwinConfig()
>>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
>>> model = UnimerSwinModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "unimer-swin"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
image_size=224,
patch_size=4,
num_channels=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
use_absolute_embeddings=False,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs,
):
super().__init__(**kwargs)
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.window_size = window_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
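# e.g. with the defaults embed_dim=96 and depths=[2, 2, 6, 2]: 96 * 2 ** (4 - 1) = 768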
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
from transformers.image_processing_utils import BaseImageProcessor
import numpy as np
import cv2
import albumentations as alb
from albumentations.pytorch import ToTensorV2
# TODO: dereference cv2 if possible
class UnimerSwinImageProcessor(BaseImageProcessor):
def __init__(
self,
image_size=(192, 672),
):
super().__init__()
self.input_size = [int(x) for x in image_size]
assert len(self.input_size) == 2, 'image_size must be (height, width)'
self.transform = alb.Compose(
[
alb.ToGray(always_apply=True),
alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
# alb.Sharpen()
ToTensorV2(),
]
)
def __call__(self, item):
image = self.prepare_input(item)
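# ToGray leaves three identical channels, so [:1] keeps just the first, giving a (1, H, W) tensor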
return self.transform(image=image)['image'][:1]
@staticmethod
def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
"""Crop margins of image using NumPy operations"""
# Convert to grayscale if it's a color image
if len(img.shape) == 3 and img.shape[2] == 3:
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
else:
gray = img.copy()
# Normalize and threshold
if gray.max() == gray.min():
return img
normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8)
binary = 255 * (normalized < 200).astype(np.uint8)
# Find bounding box
coords = cv2.findNonZero(binary) # Find all non-zero points (text)
x, y, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
# Return cropped image
return img[y:y + h, x:x + w]
def prepare_input(self, img, random_padding: bool = False):
"""
Convert PIL Image or numpy array to properly sized and padded image after:
- crop margins
- resize while maintaining aspect ratio
- pad to target size
"""
if img is None:
return None
try:
img = self.crop_margin_numpy(img)
except Exception:
# might throw an error for broken files
return None
if img.shape[0] == 0 or img.shape[1] == 0:
return None
# Resize while preserving aspect ratio
h, w = img.shape[:2]
scale = min(self.input_size[0] / h, self.input_size[1] / w)
new_h, new_w = int(h * scale), int(w * scale)
resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
# Calculate padding
pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
# Create and apply padding; keep a channel axis on grayscale inputs so the assignment broadcasts
channels = 3 if len(img.shape) == 3 else 1
padded_img = np.full((self.input_size[0], self.input_size[1], channels), 255, dtype=np.uint8)
if resized_img.ndim == 2:
resized_img = resized_img[:, :, np.newaxis]
padded_img[pad_height:pad_height + new_h, pad_width:pad_width + new_w] = resized_img
return padded_img
def _calculate_padding(self, new_w, new_h, random_padding):
"""Calculate padding values for PIL images"""
delta_width = self.input_size[1] - new_w
delta_height = self.input_size[0] - new_h
pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
return (
pad_width,
pad_height,
delta_width - pad_width,
delta_height - pad_height,
)
def _get_padding_values(self, new_w, new_h, random_padding):
"""Get padding values based on image dimensions and padding strategy"""
delta_width = self.input_size[1] - new_w
delta_height = self.input_size[0] - new_h
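# e.g. target (192, 672), resized (160, 600): delta_height=32, delta_width=72 -> pad_height=16, pad_width=36 when centered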
if random_padding:
pad_width = np.random.randint(low=0, high=delta_width + 1)
pad_height = np.random.randint(low=0, high=delta_height + 1)
else:
pad_width = delta_width // 2
pad_height = delta_height // 2
return pad_width, pad_height
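A minimal usage sketch for the image processor above, assuming a local RGB image; the file name `formula.png` and the variable names are illustrative only:

```python
import numpy as np
from PIL import Image

processor = UnimerSwinImageProcessor()  # crops margins, resizes, pads to (192, 672)
img = np.array(Image.open('formula.png').convert('RGB'))
tensor = processor(img)  # normalized single-channel tensor of shape (1, 192, 672)
```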
import os
import torch
from loguru import logger
from magic_pdf.config.constants import MODEL_NAME
from magic_pdf.model.model_list import AtomicModel
-from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import \
-    YOLOv11LangDetModel
-from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
-    DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
-    Layoutlmv3_Predictor
+from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
+from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
@@ -18,10 +12,8 @@ try:
from magic_pdf_ascend_plugin.libs.license_verifier import (
LicenseExpiredError, LicenseFormatError, LicenseSignatureError,
load_license)
-from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import \
-    ModifiedPaddleOCR
-from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import \
-    RapidTableModel
+from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
+from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
license_key = load_license()
logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
f' License expired at {license_key["payload"]["date"]["end_date"]}')
@@ -42,16 +34,13 @@ except Exception as e:
# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
-from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
-    StructTableModel
-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
-    TableMasterPaddleModel
def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
+from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
table_model = StructTableModel(model_path, max_new_tokens=2048, max_time=max_time)
elif table_model_type == MODEL_NAME.TABLE_MASTER:
+from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
config = {
'model_dir': model_path,
'device': _device_
@@ -79,6 +68,7 @@ def mfr_model_init(weight_dir, cfg_path, device='cpu'):
def layout_model_init(weight, config_file, device):
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
model = Layoutlmv3_Predictor(weight, config_file, device)
return model
......
import time
import torch
from PIL import Image
from loguru import logger
import numpy as np
from magic_pdf.libs.clean_memory import clean_memory
-def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
-# Create a white background with an additional width and height of 50
+# Calculate new dimensions
crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
-return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
-# Crop image
-crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
-cropped_img = input_pil_img.crop(crop_box)
-return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
-return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+# Create a white background array
+return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
+# Crop the original image using numpy slicing
+cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
+# Paste the cropped image onto the white background
+return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
+crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
+return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
+crop_new_height]
return return_image, return_list
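A short sketch exercising the rewritten `crop_img`, assuming the 8-element `poly` layout used above (xmin/ymin at indices 0/1, xmax/ymax at indices 4/5); the page array and coordinates are made up for illustration:

```python
import numpy as np

page = np.full((1000, 800, 3), 128, dtype=np.uint8)       # fake page image (H, W, C)
res = {'poly': [100, 200, 300, 200, 300, 400, 100, 400]}  # clockwise quad
crop, meta = crop_img(res, page, crop_paste_x=25, crop_paste_y=25)
print(crop.shape)  # (250, 250, 3): the 200x200 region plus a 25 px white border on each side
print(meta)        # [25, 25, 100, 200, 300, 400, 250, 250]
```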
......
@@ -492,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
else:
return [[x0, y0, x1, y1]]
# @measure_time
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
......
@@ -2,7 +2,7 @@ weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
-unimernet_small: MFR/unimernet_small_2501
+unimernet_small: MFR/unimernet_hf_small_2503
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
rapid_table: TabRec/RapidTable
\ No newline at end of file
@@ -7,7 +7,8 @@ numpy>=1.21.6,<2.0.0
pydantic>=2.7.2
PyMuPDF>=1.24.9,<=1.24.14
scikit-learn>=1.0.2
-torch>=2.2.2
-transformers
+torch>=2.2.2,!=2.5.0,!=2.5.1,<=2.6.0
+torchvision
+transformers>=4.49.0
pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
import json
+import shutil
import os
import requests
@@ -16,7 +17,7 @@ def download_and_modify_json(url, local_filename, modifications):
if os.path.exists(local_filename):
data = json.load(open(local_filename))
config_version = data.get('config_version', '0.0.0')
-if config_version < '1.1.1':
+if config_version < '1.2.0':
data = download_json(url)
else:
data = download_json(url)
@@ -32,12 +33,13 @@ def download_and_modify_json(url, local_filename, modifications):
if __name__ == '__main__':
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
# "models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small_2501/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
"models/MFR/unimernet_hf_small_2503/*",
"models/OCR/paddleocr/*",
# "models/TabRec/TableMaster/*",
# "models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
@@ -45,6 +47,12 @@ if __name__ == '__main__':
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
+paddleocr_model_dir = model_dir + '/OCR/paddleocr'
+user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
+if os.path.exists(user_paddleocr_dir):
+shutil.rmtree(user_paddleocr_dir)
+shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
......