refactor: reorganize project structure and update import paths

cbba27b4 · myhloli · 3027c677 · cbba27b4 · cbba27b4 · cbba27b4
Commit cbba27b4 authored May 28, 2025 by myhloli
20 changed files
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
+from .configuration_unimer_swin import UnimerSwinConfig
+from .modeling_unimer_swin import UnimerSwinModel
+from .image_processing_unimer_swin import UnimerSwinImageProcessor
+
+# Public names re-exported by this package; they match the three imports above.
+__all__ = [
+    "UnimerSwinConfig",
+    "UnimerSwinModel",
+    "UnimerSwinImageProcessor",
+]
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Donut Swin Transformer model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+# Module-level logger created through the transformers logging helper, keyed by this module's name.
+logger = logging.get_logger(__name__)
+
+
+class UnimerSwinConfig(PretrainedConfig):
+    r"""
+    Configuration class for a [`UnimerSwinModel`]. It stores the hyper-parameters that define the Swin-style
+    encoder architecture. The defaults below mirror the signature of `__init__`.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+            Depth of each layer in the Transformer encoder. The sequence is copied, so the configuration never
+            shares state with the caller's list or with the default value.
+        num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
+            Number of attention heads in each layer of the Transformer encoder. Copied like `depths`.
+        window_size (`int`, *optional*, defaults to 7):
+            Size of windows.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to add absolute position embeddings to the patch embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+
+    Derived attributes set by `__init__`:
+        num_layers: `len(depths)`.
+        hidden_size: `int(embed_dim * 2 ** (len(depths) - 1))`.
+
+    Example:
+
+    ```python
+    >>> from mineru.model.mfr.unimernet.unimernet_hf.unimer_swin import UnimerSwinConfig, UnimerSwinModel
+
+    >>> # Initializing a configuration with the default values
+    >>> configuration = UnimerSwinConfig()
+
+    >>> # Randomly initializing a model from that configuration
+    >>> model = UnimerSwinModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "unimer-swin"
+
+    # Generic attribute names mapped onto the Swin-specific attribute names used by this class.
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        use_absolute_embeddings=False,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        # Copy the per-stage sequences: the signature defaults are list objects shared by every call, so storing
+        # them by reference would let a mutation of one config's `depths`/`num_heads` leak into later instances.
+        self.depths = list(depths)
+        self.num_layers = len(self.depths)
+        self.num_heads = list(num_heads)
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.use_absolute_embeddings = use_absolute_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(self.depths) - 1))
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py
+from transformers.image_processing_utils import BaseImageProcessor
+import numpy as np
+import cv2
+import albumentations as alb
+from albumentations.pytorch import ToTensorV2
+
+
+# TODO: dereference cv2 if possible
+class UnimerSwinImageProcessor(BaseImageProcessor):
+    """Resize, pad and normalize an image array before it is fed to the model.
+
+    Calling an instance runs `prepare_input` (aspect-preserving resize plus zero
+    padding to `input_size`) followed by the albumentations pipeline built in
+    `__init__` (gray conversion, normalization, tensor conversion).
+    """
+
+    def __init__(
+            self,
+            image_size = (192, 672),
+        ):
+        """Store the target size and build the normalization pipeline.
+
+        Args:
+            image_size: Two-element sequence; `prepare_input` unpacks it as
+                `(target_h, target_w)`. Each element is converted with `int`.
+        """
+        # NOTE(review): BaseImageProcessor.__init__ is not invoked here; confirm that none of the
+        # base-class state is needed before relying on inherited save/serialization helpers.
+        self.input_size = [int(_) for _ in image_size]
+        assert len(self.input_size) == 2
+
+        self.transform = alb.Compose(
+            [
+                alb.ToGray(),
+                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
+                # alb.Sharpen()
+                ToTensorV2(),
+            ]
+        )
+
+    def __call__(self, item):
+        """Preprocess `item` and return the first slice of the transformed tensor.
+
+        NOTE(review): `prepare_input` returns None for a None or empty input and that None is
+        passed straight to the transform, which is expected to fail; callers should filter
+        such inputs beforehand.
+        """
+        image = self.prepare_input(item)
+        # `[:1]` keeps only the first entry along the leading axis
+        # (presumably the channel axis after ToTensorV2 - TODO confirm).
+        return self.transform(image=image)['image'][:1]
+
+    @staticmethod
+    def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
+        """Crop the image to the bounding box of its darker pixels.
+
+        The image is converted to gray (when it has three channels), min-max
+        normalized to 0..255, and every pixel whose normalized value is below
+        200 is treated as content. A constant image is returned unchanged.
+        """
+        # Convert to grayscale if it's a color image
+        if len(img.shape) == 3 and img.shape[2] == 3:
+            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img.copy()
+
+        lowest, highest = gray.min(), gray.max()
+        # A constant image has no contrast to threshold on (and would divide by zero below).
+        if highest == lowest:
+            return img
+
+        # Normalize and threshold
+        normalized = (((gray - lowest) / (highest - lowest)) * 255).astype(np.uint8)
+        binary = 255 * (normalized < 200).astype(np.uint8)
+
+        # Find bounding box
+        coords = cv2.findNonZero(binary)  # Find all non-zero points (text)
+        x, y, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+
+        # Return cropped image
+        return img[y:y + h, x:x + w]
+
+    def prepare_input(self, img: np.ndarray, random_padding: bool = False):
+        """Resize an image array to fit `input_size` and pad it to exactly that size.
+
+        Steps:
+            - resize while maintaining aspect ratio so the image fits inside the target
+            - pad with zeros up to `(target_h, target_w)`
+
+        Margin cropping via `crop_margin_numpy` is currently not applied here.
+
+        Args:
+            img: Image as an array exposing `.shape` (height first, then width); may be None.
+            random_padding: When True the image is placed at a random offset inside
+                the target canvas instead of being centered.
+
+        Returns:
+            The padded array, or None when `img` is None or has a zero-sized
+            height or width.
+        """
+        if img is None:
+            return None
+
+        if img.shape[0] == 0 or img.shape[1] == 0:
+            return None
+
+        # Get current dimensions
+        h, w = img.shape[:2]
+        target_h, target_w = self.input_size
+
+        # Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail)
+        scale = min(target_h / h, target_w / w)
+
+        # Calculate new dimensions. Truncation can yield 0 for extreme aspect ratios
+        # (e.g. a 1 x 10000 strip), which cv2.resize rejects, so clamp to one pixel.
+        new_h = max(1, int(h * scale))
+        new_w = max(1, int(w * scale))
+
+        # Resize the image while preserving aspect ratio
+        resized_img = cv2.resize(img, (new_w, new_h))
+
+        # Padding on each side, in (left, top, right, bottom) order
+        pad_left, pad_top, pad_right, pad_bottom = self._calculate_padding(new_w, new_h, random_padding)
+
+        # One fill value per channel for color images, a single value otherwise
+        padding_color = [0, 0, 0] if len(img.shape) == 3 else [0]
+
+        # cv2.copyMakeBorder takes its borders in (top, bottom, left, right) order
+        padded_img = cv2.copyMakeBorder(
+            resized_img,
+            pad_top,
+            pad_bottom,
+            pad_left,
+            pad_right,
+            cv2.BORDER_CONSTANT,
+            value=padding_color
+        )
+
+        return padded_img
+
+    def _calculate_padding(self, new_w, new_h, random_padding):
+        """Return the padding as a `(left, top, right, bottom)` tuple.
+
+        The left/top amounts come from `_get_padding_values`; right/bottom take
+        whatever remains of the difference to `input_size`.
+        """
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h
+
+        pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+        return (
+            pad_width,
+            pad_height,
+            delta_width - pad_width,
+            delta_height - pad_height,
+        )
+
+    def _get_padding_values(self, new_w, new_h, random_padding):
+        """Return the `(left, top)` padding for an image of size `new_w` x `new_h`.
+
+        With `random_padding` the offsets are drawn uniformly from
+        `0..delta` (inclusive); otherwise half of the free space is used,
+        rounded down.
+        """
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h
+
+        if random_padding:
+            # `high` is exclusive in np.random.randint, hence the + 1
+            pad_width = np.random.randint(low=0, high=delta_width + 1)
+            pad_height = np.random.randint(low=0, high=delta_height + 1)
+        else:
+            pad_width = delta_width // 2
+            pad_height = delta_height // 2
+
+        return pad_width, pad_height
--- a/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py
+++ b/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py
--- a/mineru/model/ocr/__init__.py
+++ b/mineru/model/ocr/__init__.py
+# Copyright (c) Opendatalab. All rights reserved.
--- a/mineru/model/ocr/paddleocr2pytorch/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/__init__.py
+# Copyright (c) Opendatalab. All rights reserved.
--- a/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/__init__.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .imaug import transform, create_operators
+
+
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/__init__.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py
--- a/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py
+++ b/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py