"vscode:/vscode.git/clone" did not exist on "db77dfe01f7ee7bff8799a103406828974136f22"
Unverified commit 0c7a0882, authored by Xiaomeng Zhao and committed by GitHub

Merge pull request #2611 from myhloli/dev

Dev
parents 3bd0ecf1 a392f445
# Copyright (c) Opendatalab. All rights reserved.
import os
from pathlib import Path
import yaml
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable the albumentations update check
from magic_pdf.config.constants import MODEL_NAME
from magic_pdf.data.utils import load_images_from_pdf
from magic_pdf.libs.config_reader import get_local_models_dir, get_device
from magic_pdf.libs.pdf_check import extract_pages
from magic_pdf.model.model_list import AtomicModel
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
def get_model_config():
local_models_dir = get_local_models_dir()
device = get_device()
current_file_path = os.path.abspath(__file__)
root_dir = Path(current_file_path).parents[3]
model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
config_path = os.path.join(model_config_dir, 'model_configs.yaml')
with open(config_path, 'r', encoding='utf-8') as f:
configs = yaml.load(f, Loader=yaml.FullLoader)
return root_dir, local_models_dir, device, configs
def get_text_images(simple_images):
_, local_models_dir, device, configs = get_model_config()
atom_model_manager = AtomModelSingleton()
temp_layout_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Layout,
layout_model_name=MODEL_NAME.DocLayout_YOLO,
doclayout_yolo_weights=str(
os.path.join(
local_models_dir, configs['weights'][MODEL_NAME.DocLayout_YOLO]
)
),
device=device,
)
text_images = []
for simple_image in simple_images:
image = simple_image['img']
layout_res = temp_layout_model.predict(image)
        # crop each text block from the page image
for res in layout_res:
if res['category_id'] in [1]:
x1, y1, _, _, x2, y2, _, _ = res['poly']
                # preliminary filtering: skip blocks whose width and height are both under 100 px
if x2 - x1 < 100 and y2 - y1 < 100:
continue
text_images.append(image[y1:y2, x1:x2])
return text_images
def auto_detect_lang(pdf_bytes: bytes):
sample_docs = extract_pages(pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
simple_images = load_images_from_pdf(sample_pdf_bytes, dpi=200)
text_images = get_text_images(simple_images)
langdetect_model = model_init(MODEL_NAME.YOLO_V11_LangDetect)
lang = langdetect_model.do_detect(text_images)
return lang
def model_init(model_name: str):
atom_model_manager = AtomModelSingleton()
if model_name == MODEL_NAME.YOLO_V11_LangDetect:
root_dir, _, device, _ = get_model_config()
model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.LangDetect,
langdetect_model_name=MODEL_NAME.YOLO_V11_LangDetect,
langdetect_model_weight=str(os.path.join(root_dir, 'resources', 'yolov11-langdetect', 'yolo_v11_ft.pt')),
device=device,
)
else:
raise ValueError(f"model_name {model_name} not found")
return model
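# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal, hedged example of how auto_detect_lang might be invoked. The PDF
# path below is purely hypothetical, and a configured magic-pdf installation
# (magic-pdf.json pointing at the local model weights) is assumed.
if __name__ == '__main__':
    with open('/path/to/sample.pdf', 'rb') as f:  # hypothetical path
        sample_pdf = f.read()
    detected = auto_detect_lang(sample_pdf)
    print(f'detected language: {detected}')  # e.g. 'ch', 'en', ...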
# Copyright (c) Opendatalab. All rights reserved.
import time
from collections import Counter
from uuid import uuid4
import cv2
import numpy as np
import torch
from loguru import logger
from ultralytics import YOLO
language_dict = {  # language code -> display name (values kept in Chinese)
"ch": "中文简体",
"en": "英语",
"japan": "日语",
"korean": "韩语",
"fr": "法语",
"german": "德语",
"ar": "阿拉伯语",
"ru": "俄语"
}
def split_images(image, result_images=None):
"""
    Recursively split an image whose longer side exceeds 400 pixels: the image is
    halved along its longer side until every resulting sub-image has a longer side
    of at most 400 pixels, and the sub-images are collected in result_images.
    Crop regions that would fall outside the image bounds are skipped, so no
    padded black fragments are produced.
"""
if result_images is None:
result_images = []
height, width = image.shape[:2]
    long_side = max(width, height)  # length of the longer side
if long_side <= 400:
result_images.append(image)
return result_images
new_long_side = long_side // 2
sub_images = []
    if width >= height:  # width is the longer side
for x in range(0, width, new_long_side):
            # skip crops that would extend beyond the image bounds
if x + new_long_side > width:
continue
sub_image = image[0:height, x:x + new_long_side]
sub_images.append(sub_image)
    else:  # height is the longer side
for y in range(0, height, new_long_side):
            # skip crops that would extend beyond the image bounds
if y + new_long_side > height:
continue
sub_image = image[y:y + new_long_side, 0:width]
sub_images.append(sub_image)
for sub_image in sub_images:
split_images(sub_image, result_images)
return result_images
def resize_images_to_224(image):
"""
    Pad the image onto a 224x224 black background if either side is smaller than
    224; otherwise resize it to 224x224.
Works directly with NumPy arrays.
"""
try:
height, width = image.shape[:2]
if width < 224 or height < 224:
# Create black background
new_image = np.zeros((224, 224, 3), dtype=np.uint8)
# Calculate paste position (ensure they're not negative)
paste_x = max(0, (224 - width) // 2)
paste_y = max(0, (224 - height) // 2)
# Make sure we don't exceed the boundaries of new_image
paste_width = min(width, 224)
paste_height = min(height, 224)
# Paste original image onto black background
new_image[paste_y:paste_y + paste_height, paste_x:paste_x + paste_width] = image[:paste_height, :paste_width]
image = new_image
else:
# Resize using cv2
image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_LANCZOS4)
return image
except Exception as e:
logger.exception(f"Error in resize_images_to_224: {e}")
return None
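# --- Illustration (hedged addition, not part of the original file) ---
# Sketch of the preprocessing pipeline used by do_detect below: a crop is
# recursively halved along its longer side until every piece is at most
# 400 px, then each piece is padded or resized to 224x224.
def _demo_split_and_resize():
    demo = np.full((300, 900, 3), 255, dtype=np.uint8)  # synthetic white crop
    pieces = split_images(demo)                          # four 300x225 pieces
    return [resize_images_to_224(p) for p in pieces]     # all (224, 224, 3)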
class YOLOv11LangDetModel(object):
def __init__(self, langdetect_model_weight, device):
self.model = YOLO(langdetect_model_weight)
if str(device).startswith("npu"):
self.device = torch.device(device)
else:
self.device = device
def do_detect(self, images: list):
all_images = []
for image in images:
height, width = image.shape[:2]
if width < 100 and height < 100:
continue
temp_images = split_images(image)
for temp_image in temp_images:
all_images.append(resize_images_to_224(temp_image))
# langdetect_start = time.time()
images_lang_res = self.batch_predict(all_images, batch_size=256)
# logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
if len(images_lang_res) > 0:
count_dict = Counter(images_lang_res)
language = max(count_dict, key=count_dict.get)
else:
language = None
return language
def predict(self, image):
results = self.model.predict(image, verbose=False, device=self.device)
predicted_class_id = int(results[0].probs.top1)
predicted_class_name = self.model.names[predicted_class_id]
return predicted_class_name
def batch_predict(self, images: list, batch_size: int) -> list:
images_lang_res = []
for index in range(0, len(images), batch_size):
lang_res = [
image_res.cpu()
for image_res in self.model.predict(
images[index: index + batch_size],
                    verbose=False,
device=self.device,
)
]
for res in lang_res:
predicted_class_id = int(res.probs.top1)
predicted_class_name = self.model.names[predicted_class_id]
images_lang_res.append(predicted_class_name)
return images_lang_res
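# --- Usage sketch (illustrative addition, not part of the original file) ---
# Hedged example of using YOLOv11LangDetModel directly. The weights path and
# image directory are hypothetical; in the project the model is normally
# obtained via AtomModelSingleton rather than constructed by hand.
if __name__ == '__main__':
    import glob

    detector = YOLOv11LangDetModel(
        langdetect_model_weight='/path/to/yolo_v11_ft.pt',  # hypothetical path
        device='cpu',
    )
    crops = [cv2.imread(p) for p in glob.glob('/path/to/text_crops/*.png')]  # hypothetical crops
    print(detector.do_detect(crops))  # most frequent language code, or None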
# --------------------------------------------------------------------------------
# VIT: Multi-Path Vision Transformer for Dense Prediction
# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI).
# All Rights Reserved.
# Written by Youngwan Lee
# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------------------------------
# References:
# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
# CoaT: https://github.com/mlpc-ucsd/CoaT
# --------------------------------------------------------------------------------
import torch
from detectron2.layers import (
ShapeSpec,
)
from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN
from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool
from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16
from .deit import deit_base_patch16, mae_base_patch16
from .layoutlmft.models.layoutlmv3 import LayoutLMv3Model
from transformers import AutoConfig
__all__ = [
"build_vit_fpn_backbone",
]
class VIT_Backbone(Backbone):
"""
Implement VIT backbone.
"""
def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs,
config_path=None, image_only=False, cfg=None):
super().__init__()
self._out_features = out_features
if 'base' in name:
self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32}
self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
else:
self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32}
self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024}
if name == 'beit_base_patch16':
model_func = beit_base_patch16
elif name == 'dit_base_patch16':
model_func = dit_base_patch16
elif name == "deit_base_patch16":
model_func = deit_base_patch16
elif name == "mae_base_patch16":
model_func = mae_base_patch16
elif name == "dit_large_patch16":
model_func = dit_large_patch16
elif name == "beit_large_patch16":
model_func = beit_large_patch16
if 'beit' in name or 'dit' in name:
if pos_type == "abs":
self.backbone = model_func(img_size=img_size,
out_features=out_features,
drop_path_rate=drop_path,
use_abs_pos_emb=True,
**model_kwargs)
elif pos_type == "shared_rel":
self.backbone = model_func(img_size=img_size,
out_features=out_features,
drop_path_rate=drop_path,
use_shared_rel_pos_bias=True,
**model_kwargs)
elif pos_type == "rel":
self.backbone = model_func(img_size=img_size,
out_features=out_features,
drop_path_rate=drop_path,
use_rel_pos_bias=True,
**model_kwargs)
else:
raise ValueError()
elif "layoutlmv3" in name:
config = AutoConfig.from_pretrained(config_path)
# disable relative bias as DiT
config.has_spatial_attention_bias = False
config.has_relative_attention_bias = False
self.backbone = LayoutLMv3Model(config, detection=True,
out_features=out_features, image_only=image_only)
else:
self.backbone = model_func(img_size=img_size,
out_features=out_features,
drop_path_rate=drop_path,
**model_kwargs)
self.name = name
def forward(self, x):
"""
Args:
x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
Returns:
dict[str->Tensor]: names and the corresponding features
"""
if "layoutlmv3" in self.name:
return self.backbone.forward(
input_ids=x["input_ids"] if "input_ids" in x else None,
bbox=x["bbox"] if "bbox" in x else None,
images=x["images"] if "images" in x else None,
attention_mask=x["attention_mask"] if "attention_mask" in x else None,
# output_hidden_states=True,
)
assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
return self.backbone.forward_features(x)
def output_shape(self):
return {
name: ShapeSpec(
channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
)
for name in self._out_features
}
def build_VIT_backbone(cfg):
"""
Create a VIT instance from config.
Args:
cfg: a detectron2 CfgNode
Returns:
A VIT backbone instance.
"""
# fmt: off
name = cfg.MODEL.VIT.NAME
out_features = cfg.MODEL.VIT.OUT_FEATURES
drop_path = cfg.MODEL.VIT.DROP_PATH
img_size = cfg.MODEL.VIT.IMG_SIZE
pos_type = cfg.MODEL.VIT.POS_TYPE
model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", ""))
if 'layoutlmv3' in name:
if cfg.MODEL.CONFIG_PATH != '':
config_path = cfg.MODEL.CONFIG_PATH
else:
config_path = cfg.MODEL.WEIGHTS.replace('pytorch_model.bin', '') # layoutlmv3 pre-trained models
config_path = config_path.replace('model_final.pth', '') # detection fine-tuned models
else:
config_path = None
return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs,
config_path=config_path, image_only=cfg.MODEL.IMAGE_ONLY, cfg=cfg)
@BACKBONE_REGISTRY.register()
def build_vit_fpn_backbone(cfg, input_shape: ShapeSpec):
"""
Create a VIT w/ FPN backbone.
Args:
cfg: a detectron2 CfgNode
Returns:
backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
"""
bottom_up = build_VIT_backbone(cfg)
in_features = cfg.MODEL.FPN.IN_FEATURES
out_channels = cfg.MODEL.FPN.OUT_CHANNELS
backbone = FPN(
bottom_up=bottom_up,
in_features=in_features,
out_channels=out_channels,
norm=cfg.MODEL.FPN.NORM,
top_block=LastLevelMaxPool(),
fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
)
return backbone
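# --- Configuration sketch (illustrative addition, not part of the original file) ---
# Hedged example of the config keys read by build_VIT_backbone and
# build_vit_fpn_backbone above. The key names mirror the attribute accesses in
# the code; the concrete values (model name, drop path, image size, FPN
# channels) are placeholder assumptions, not the project's actual defaults.
# MODEL.FPN.NORM and MODEL.FPN.FUSE_TYPE come from detectron2's defaults.
from detectron2.config import get_cfg, CfgNode as CN

def _example_vit_fpn_cfg():
    cfg = get_cfg()
    cfg.MODEL.VIT = CN()
    cfg.MODEL.VIT.NAME = 'dit_base_patch16'                               # placeholder
    cfg.MODEL.VIT.OUT_FEATURES = ['layer3', 'layer5', 'layer7', 'layer11']
    cfg.MODEL.VIT.DROP_PATH = 0.1                                         # placeholder
    cfg.MODEL.VIT.IMG_SIZE = [224, 224]                                   # placeholder
    cfg.MODEL.VIT.POS_TYPE = 'abs'                                        # placeholder
    cfg.MODEL.VIT.MODEL_KWARGS = '{}'
    cfg.MODEL.CONFIG_PATH = ''
    cfg.MODEL.IMAGE_ONLY = True
    cfg.MODEL.FPN.IN_FEATURES = ['layer3', 'layer5', 'layer7', 'layer11']
    cfg.MODEL.FPN.OUT_CHANNELS = 256                                      # placeholder
    return cfg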
from .models import (
LayoutLMv3Config,
LayoutLMv3ForTokenClassification,
LayoutLMv3ForQuestionAnswering,
LayoutLMv3ForSequenceClassification,
LayoutLMv3Tokenizer,
)
# flake8: noqa
from .data_collator import DataCollatorForKeyValueExtraction
from .layoutlmv3 import (
LayoutLMv3Config,
LayoutLMv3ForTokenClassification,
LayoutLMv3ForQuestionAnswering,
LayoutLMv3ForSequenceClassification,
LayoutLMv3Tokenizer,
)
from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, \
AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer
from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter
from .configuration_layoutlmv3 import LayoutLMv3Config
from .modeling_layoutlmv3 import (
LayoutLMv3ForTokenClassification,
LayoutLMv3ForQuestionAnswering,
LayoutLMv3ForSequenceClassification,
LayoutLMv3Model,
)
from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast
#AutoConfig.register("layoutlmv3", LayoutLMv3Config)
#AutoModel.register(LayoutLMv3Config, LayoutLMv3Model)
#AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification)
#AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering)
#AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification)
#AutoTokenizer.register(
# LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast
#)
SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter})