Commit cbba27b4 authored by myhloli

refactor: reorganize project structure and update import paths

parent 3027c677
# Copyright (c) Opendatalab. All rights reserved.
import cv2
from loguru import logger
from tqdm import tqdm
from .model_init import AtomModelSingleton
from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
from ...utils.ocr_utils import get_adjusted_mfdetrec_res, get_ocr_result_list
YOLO_LAYOUT_BASE_BATCH_SIZE = 1
MFD_BASE_BATCH_SIZE = 1
MFR_BASE_BATCH_SIZE = 16
class BatchAnalyze:
def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable):
self.batch_ratio = batch_ratio
self.formula_enable = formula_enable
self.table_enable = table_enable
self.model_manager = model_manager
def __call__(self, images_with_extra_info: list) -> list:
if len(images_with_extra_info) == 0:
return []
images_layout_res = []
self.model = self.model_manager.get_model(
lang=None,
formula_enable=self.formula_enable,
table_enable=self.table_enable,
)
atom_model_manager = AtomModelSingleton()
images = [image for image, _, _ in images_with_extra_info]
# doclayout_yolo
layout_images = []
for image_index, image in enumerate(images):
layout_images.append(image)
images_layout_res += self.model.layout_model.batch_predict(
layout_images, YOLO_LAYOUT_BASE_BATCH_SIZE
)
if self.formula_enable:
# Formula detection
images_mfd_res = self.model.mfd_model.batch_predict(
images, MFD_BASE_BATCH_SIZE
)
# Formula recognition
images_formula_list = self.model.mfr_model.batch_predict(
images_mfd_res,
images,
batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
)
mfr_count = 0
for image_index in range(len(images)):
images_layout_res[image_index] += images_formula_list[image_index]
mfr_count += len(images_formula_list[image_index])
# Free GPU memory
# clean_vram(self.model.device, vram_threshold=8)
ocr_res_list_all_page = []
table_res_list_all_page = []
for index in range(len(images)):
_, ocr_enable, _lang = images_with_extra_info[index]
layout_res = images_layout_res[index]
np_array_img = images[index]
ocr_res_list, table_res_list, single_page_mfdetrec_res = (
get_res_list_from_layout_res(layout_res)
)
ocr_res_list_all_page.append({'ocr_res_list':ocr_res_list,
'lang':_lang,
'ocr_enable':ocr_enable,
'np_array_img':np_array_img,
'single_page_mfdetrec_res':single_page_mfdetrec_res,
'layout_res':layout_res,
})
for table_res in table_res_list:
table_img, _ = crop_img(table_res, np_array_img)
table_res_list_all_page.append({'table_res':table_res,
'lang':_lang,
'table_img':table_img,
})
# Text-box detection (OCR det)
for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
# Process each area that requires OCR processing
_lang = ocr_res_list_dict['lang']
# Get OCR results for this language's images
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
det_db_box_thresh=0.3,
lang=_lang
)
for res in ocr_res_list_dict['ocr_res_list']:
new_image, useful_list = crop_img(
res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
)
adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
)
# OCR-det
new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
ocr_res = ocr_model.ocr(
new_image, mfd_res=adjusted_mfdetrec_res, rec=False
)[0]
# Integration results
if ocr_res:
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
if res["category_id"] == 3:
# Sum of the areas of all bboxes in ocr_result_list
ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
# Ratio of ocr_res_area to the area of res
res_area = get_coords_and_area(res)[4]
if res_area > 0:
ratio = ocr_res_area / res_area
if ratio > 0.25:
res["category_id"] = 1
else:
continue
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
# Table recognition
if self.table_enable:
for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
_lang = table_res_dict['lang']
table_model = atom_model_manager.get_atom_model(
atom_model_name='table',
device='cpu',
lang=_lang,
table_sub_model_name='slanet_plus'
)
html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])
# Check whether a valid result was returned
if html_code:
expected_ending = html_code.strip().endswith(
'</html>'
) or html_code.strip().endswith('</table>')
if expected_ending:
table_res_dict['table_res']['html'] = html_code
else:
logger.warning(
'table recognition failed: expected HTML ending tag not found'
)
else:
logger.warning(
'table recognition failed: no HTML returned'
)
# Create dictionaries to store items by language
need_ocr_lists_by_lang = {} # Dict of lists for each language
img_crop_lists_by_lang = {} # Dict of lists for each language
for layout_res in images_layout_res:
for layout_res_item in layout_res:
if layout_res_item['category_id'] in [15]:
if 'np_img' in layout_res_item and 'lang' in layout_res_item:
lang = layout_res_item['lang']
# Initialize lists for this language if not exist
if lang not in need_ocr_lists_by_lang:
need_ocr_lists_by_lang[lang] = []
img_crop_lists_by_lang[lang] = []
# Add to the appropriate language-specific lists
need_ocr_lists_by_lang[lang].append(layout_res_item)
img_crop_lists_by_lang[lang].append(layout_res_item['np_img'])
# Remove the fields after adding to lists
layout_res_item.pop('np_img')
layout_res_item.pop('lang')
if len(img_crop_lists_by_lang) > 0:
# Process OCR by language
total_processed = 0
# Process each language separately
for lang, img_crop_list in img_crop_lists_by_lang.items():
if len(img_crop_list) > 0:
# Get OCR results for this language's images
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
det_db_box_thresh=0.3,
lang=lang
)
ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
# Verify we have matching counts
assert len(ocr_res_list) == len(
need_ocr_lists_by_lang[lang]), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_lists_by_lang[lang])} for lang: {lang}'
# Process OCR results for this language
for index, layout_res_item in enumerate(need_ocr_lists_by_lang[lang]):
ocr_text, ocr_score = ocr_res_list[index]
layout_res_item['text'] = ocr_text
layout_res_item['score'] = float(f"{ocr_score:.3f}")
total_processed += len(img_crop_list)
return images_layout_res
import os
import time
import numpy as np
import torch
from mineru.backend.pipeline.model_init import MineruPipelineModel
os.environ['FLAGS_npu_jit_compile'] = '0'  # disable Paddle JIT compilation
os.environ['FLAGS_use_stride_kernel'] = '0'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow MPS to fall back to CPU ops
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable the albumentations update check
from loguru import logger
from ...utils.model_utils import get_vram, clean_memory
from magic_pdf.libs.config_reader import (get_device, get_formula_config,
get_layout_config,
get_local_models_dir,
get_table_recog_config)
from magic_pdf.data.dataset import Dataset  # needed for the Dataset type hints in doc_analyze/batch_doc_analyze
class ModelSingleton:
_instance = None
_models = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def get_model(
self,
lang=None,
formula_enable=None,
table_enable=None,
):
key = (lang, formula_enable, table_enable)
if key not in self._models:
self._models[key] = custom_model_init(
lang=lang,
formula_enable=formula_enable,
table_enable=table_enable,
)
return self._models[key]
def custom_model_init(
lang=None,
formula_enable=None,
table_enable=None,
):
model_init_start = time.time()
# Read models_dir and device from the config file
local_models_dir = get_local_models_dir()
device = get_device()
formula_config = get_formula_config()
if formula_enable is not None:
formula_config['enable'] = formula_enable
table_config = get_table_recog_config()
if table_enable is not None:
table_config['enable'] = table_enable
model_input = {
'models_dir': local_models_dir,
'device': device,
'table_config': table_config,
'formula_config': formula_config,
'lang': lang,
}
custom_model = MineruPipelineModel(**model_input)
model_init_cost = time.time() - model_init_start
logger.info(f'model init cost: {model_init_cost}')
return custom_model
def doc_analyze(
dataset: Dataset,
ocr: bool = False,
start_page_id=0,
end_page_id=None,
lang=None,
formula_enable=None,
table_enable=None,
):
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(dataset) - 1
)
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
images = []
page_wh_list = []
for index in range(len(dataset)):
if start_page_id <= index <= end_page_id:
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
images.append(img_dict['img'])
page_wh_list.append((img_dict['width'], img_dict['height']))
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
if len(images) >= MIN_BATCH_INFERENCE_SIZE:
batch_size = MIN_BATCH_INFERENCE_SIZE
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
else:
batch_images = [images_with_extra_info]
results = []
processed_images_count = 0
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
result = may_batch_image_analyze(batch_image, formula_enable, table_enable)
results.extend(result)
model_json = []
for index in range(len(dataset)):
if start_page_id <= index <= end_page_id:
result = results.pop(0)
page_width, page_height = page_wh_list.pop(0)
else:
result = []
page_height = 0
page_width = 0
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)
return model_json
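# A minimal usage sketch for doc_analyze (illustration only, not part of this commit): any
# magic_pdf-style Dataset exposing __len__, get_page() and _lang works here; PymuDocDataset
# is one such class assumed to be available in magic_pdf.
#
#   from magic_pdf.data.dataset import PymuDocDataset
#   with open("example.pdf", "rb") as f:
#       ds = PymuDocDataset(f.read())
#   model_json = doc_analyze(ds, ocr=False, formula_enable=True, table_enable=True)
#   print(model_json[0]['page_info'], len(model_json[0]['layout_dets']))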
def batch_doc_analyze(
datasets: list[Dataset],
parse_method: str = 'auto',
lang=None,
formula_enable=None,
table_enable=None,
):
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
batch_size = MIN_BATCH_INFERENCE_SIZE
page_wh_list = []
images_with_extra_info = []
for dataset in datasets:
ocr = False
if parse_method == 'auto':
if dataset.classify() == 'txt':
ocr = False
elif dataset.classify() == 'ocr':
ocr = True
elif parse_method == 'ocr':
ocr = True
elif parse_method == 'txt':
ocr = False
_lang = dataset._lang
for index in range(len(dataset)):
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
page_wh_list.append((img_dict['width'], img_dict['height']))
images_with_extra_info.append((img_dict['img'], ocr, _lang))
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
results = []
processed_images_count = 0
for index, batch_image in enumerate(batch_images):
processed_images_count += len(batch_image)
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
result = may_batch_image_analyze(batch_image, formula_enable, table_enable)
results.extend(result)
infer_results = []
for index in range(len(datasets)):
dataset = datasets[index]
model_json = []
for i in range(len(dataset)):
result = results.pop(0)
page_width, page_height = page_wh_list.pop(0)
page_info = {'page_no': i, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)
infer_results.append(model_json)
return infer_results
def may_batch_image_analyze(
images_with_extra_info: list[tuple[np.ndarray, bool, str]],
formula_enable=None,
table_enable=None):
# os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
from .batch_analyze import BatchAnalyze
model_manager = ModelSingleton()
batch_ratio = 1
device = get_device()
if str(device).startswith('npu'):
import torch_npu
if torch_npu.npu.is_available():
torch.npu.set_compile_mode(jit_compile=False)
if str(device).startswith('npu') or str(device).startswith('cuda'):
vram = get_vram(device)
if vram is not None:
gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram)))
if gpu_memory >= 16:
batch_ratio = 16
elif gpu_memory >= 12:
batch_ratio = 8
elif gpu_memory >= 8:
batch_ratio = 4
elif gpu_memory >= 6:
batch_ratio = 2
else:
batch_ratio = 1
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
else:
# Default batch_ratio when VRAM can't be determined
batch_ratio = 1
logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
batch_model = BatchAnalyze(model_manager, batch_ratio, formula_enable, table_enable)
results = batch_model(images_with_extra_info)
clean_memory(get_device())
return results
import enum
from magic_pdf.config.model_block_type import ModelBlockTypeEnum
from magic_pdf.config.ocr_content_type import CategoryId, ContentType
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.boxbase import (_is_in, bbox_distance, bbox_relative_pos,
calculate_iou)
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
class PosRelationEnum(enum.Enum):
LEFT = 'left'
RIGHT = 'right'
UP = 'up'
BOTTOM = 'bottom'
ALL = 'all'
class MagicModel:
"""每个函数没有得到元素的时候返回空list."""
def __fix_axis(self):
for model_page_info in self.__model_list:
need_remove_list = []
page_no = model_page_info['page_info']['page_no']
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
model_page_info, self.__docs.get_page(page_no)
)
layout_dets = model_page_info['layout_dets']
for layout_det in layout_dets:
if layout_det.get('bbox') is not None:
# Compatible with models that output bbox directly, e.g. paddle
x0, y0, x1, y1 = layout_det['bbox']
else:
# Compatible with models that output poly directly, e.g. xxx
x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
bbox = [
int(x0 / horizontal_scale_ratio),
int(y0 / vertical_scale_ratio),
int(x1 / horizontal_scale_ratio),
int(y1 / vertical_scale_ratio),
]
layout_det['bbox'] = bbox
# Remove spans whose width or height is <= 0
if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
need_remove_list.append(layout_det)
for need_remove in need_remove_list:
layout_dets.remove(need_remove)
def __fix_by_remove_low_confidence(self):
for model_page_info in self.__model_list:
need_remove_list = []
layout_dets = model_page_info['layout_dets']
for layout_det in layout_dets:
if layout_det['score'] <= 0.05:
need_remove_list.append(layout_det)
else:
continue
for need_remove in need_remove_list:
layout_dets.remove(need_remove)
def __fix_by_remove_high_iou_and_low_confidence(self):
for model_page_info in self.__model_list:
need_remove_list = []
layout_dets = model_page_info['layout_dets']
for layout_det1 in layout_dets:
for layout_det2 in layout_dets:
if layout_det1 == layout_det2:
continue
if layout_det1['category_id'] in [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
if (
calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
> 0.9
):
if layout_det1['score'] < layout_det2['score']:
layout_det_need_remove = layout_det1
else:
layout_det_need_remove = layout_det2
if layout_det_need_remove not in need_remove_list:
need_remove_list.append(layout_det_need_remove)
else:
continue
else:
continue
for need_remove in need_remove_list:
layout_dets.remove(need_remove)
def __init__(self, model_list: list, docs: Dataset):
self.__model_list = model_list
self.__docs = docs
"""为所有模型数据添加bbox信息(缩放,poly->bbox)"""
self.__fix_axis()
"""删除置信度特别低的模型数据(<0.05),提高质量"""
self.__fix_by_remove_low_confidence()
"""删除高iou(>0.9)数据中置信度较低的那个"""
self.__fix_by_remove_high_iou_and_low_confidence()
self.__fix_footnote()
def _bbox_distance(self, bbox1, bbox2):
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
flags = [left, right, bottom, top]
count = sum([1 if v else 0 for v in flags])
if count > 1:
return float('inf')
if left or right:
l1 = bbox1[3] - bbox1[1]
l2 = bbox2[3] - bbox2[1]
else:
l1 = bbox1[2] - bbox1[0]
l2 = bbox2[2] - bbox2[0]
if l2 > l1 and (l2 - l1) / l1 > 0.3:
return float('inf')
return bbox_distance(bbox1, bbox2)
def __fix_footnote(self):
# 3: figure, 5: table, 7: footnote
for model_page_info in self.__model_list:
footnotes = []
figures = []
tables = []
for obj in model_page_info['layout_dets']:
if obj['category_id'] == 7:
footnotes.append(obj)
elif obj['category_id'] == 3:
figures.append(obj)
elif obj['category_id'] == 5:
tables.append(obj)
if len(footnotes) * len(figures) == 0:
continue
dis_figure_footnote = {}
dis_table_footnote = {}
for i in range(len(footnotes)):
for j in range(len(figures)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], figures[j]['bbox']
),
)
)
)
if pos_flag_count > 1:
continue
dis_figure_footnote[i] = min(
self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
dis_figure_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
for j in range(len(tables)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], tables[j]['bbox']
),
)
)
)
if pos_flag_count > 1:
continue
dis_table_footnote[i] = min(
self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
dis_table_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
if i not in dis_figure_footnote:
continue
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
footnotes[i]['category_id'] = CategoryId.ImageFootnote
def __reduct_overlap(self, bboxes):
N = len(bboxes)
keep = [True] * N
for i in range(N):
for j in range(N):
if i == j:
continue
if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
keep[i] = False
return [bboxes[i] for i in range(N) if keep[i]]
def __tie_up_category_by_distance_v2(
self,
page_no: int,
subject_category_id: int,
object_category_id: int,
priority_pos: PosRelationEnum,
):
"""_summary_
Args:
page_no (int): _description_
subject_category_id (int): _description_
object_category_id (int): _description_
priority_pos (PosRelationEnum): _description_
Returns:
_type_: _description_
"""
AXIS_MULPLICITY = 0.5
subjects = self.__reduct_overlap(
list(
map(
lambda x: {'bbox': x['bbox'], 'score': x['score']},
filter(
lambda x: x['category_id'] == subject_category_id,
self.__model_list[page_no]['layout_dets'],
),
)
)
)
objects = self.__reduct_overlap(
list(
map(
lambda x: {'bbox': x['bbox'], 'score': x['score']},
filter(
lambda x: x['category_id'] == object_category_id,
self.__model_list[page_no]['layout_dets'],
),
)
)
)
M = len(objects)
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
sub_obj_map_h = {i: [] for i in range(len(subjects))}
dis_by_directions = {
'top': [[-1, float('inf')]] * M,
'bottom': [[-1, float('inf')]] * M,
'left': [[-1, float('inf')]] * M,
'right': [[-1, float('inf')]] * M,
}
for i, obj in enumerate(objects):
l_x_axis, l_y_axis = (
obj['bbox'][2] - obj['bbox'][0],
obj['bbox'][3] - obj['bbox'][1],
)
axis_unit = min(l_x_axis, l_y_axis)
for j, sub in enumerate(subjects):
bbox1, bbox2, _ = _remove_overlap_between_bbox(
objects[i]['bbox'], subjects[j]['bbox']
)
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
flags = [left, right, bottom, top]
if sum([1 if v else 0 for v in flags]) > 1:
continue
if left:
if dis_by_directions['left'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['left'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if right:
if dis_by_directions['right'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['right'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if bottom:
if dis_by_directions['bottom'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['bottom'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if top:
if dis_by_directions['top'][i][1] > bbox_distance(
obj['bbox'], sub['bbox']
):
dis_by_directions['top'][i] = [
j,
bbox_distance(obj['bbox'], sub['bbox']),
]
if (
dis_by_directions['top'][i][1] != float('inf')
and dis_by_directions['bottom'][i][1] != float('inf')
and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
):
RATIO = 3
if (
abs(
dis_by_directions['top'][i][1]
- dis_by_directions['bottom'][i][1]
)
< RATIO * axis_unit
):
if priority_pos == PosRelationEnum.BOTTOM:
sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i)
else:
sub_obj_map_h[dis_by_directions['top'][i][0]].append(i)
continue
if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[
'right'
][i][1] != float('inf'):
if dis_by_directions['left'][i][1] != float(
'inf'
) and dis_by_directions['right'][i][1] != float('inf'):
if AXIS_MULPLICITY * axis_unit >= abs(
dis_by_directions['left'][i][1]
- dis_by_directions['right'][i][1]
):
left_sub_bbox = subjects[dis_by_directions['left'][i][0]][
'bbox'
]
right_sub_bbox = subjects[dis_by_directions['right'][i][0]][
'bbox'
]
left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1]
right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1]
if (
abs(left_sub_bbox_y_axis - l_y_axis)
+ dis_by_directions['left'][i][0]
> abs(right_sub_bbox_y_axis - l_y_axis)
+ dis_by_directions['right'][i][0]
):
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = dis_by_directions['left'][i]
else:
left_or_right = dis_by_directions['left'][i]
if left_or_right[1] > dis_by_directions['right'][i][1]:
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = dis_by_directions['left'][i]
if left_or_right[1] == float('inf'):
left_or_right = dis_by_directions['right'][i]
else:
left_or_right = [-1, float('inf')]
if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[
'bottom'
][i][1] != float('inf'):
if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[
'bottom'
][i][1] != float('inf'):
if AXIS_MULPLICITY * axis_unit >= abs(
dis_by_directions['top'][i][1]
- dis_by_directions['bottom'][i][1]
):
top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox']
bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox']
top_bottom_x_axis = top_bottom[2] - top_bottom[0]
bottom_top_x_axis = bottom_top[2] - bottom_top[0]
if (
abs(top_bottom_x_axis - l_x_axis)
+ dis_by_directions['bottom'][i][1]
> abs(bottom_top_x_axis - l_x_axis)
+ dis_by_directions['top'][i][1]
):
top_or_bottom = dis_by_directions['top'][i]
else:
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = dis_by_directions['top'][i]
if top_or_bottom[1] > dis_by_directions['bottom'][i][1]:
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = dis_by_directions['top'][i]
if top_or_bottom[1] == float('inf'):
top_or_bottom = dis_by_directions['bottom'][i]
else:
top_or_bottom = [-1, float('inf')]
if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'):
if left_or_right[1] != float('inf') and top_or_bottom[1] != float(
'inf'
):
if AXIS_MULPLICITY * axis_unit >= abs(
left_or_right[1] - top_or_bottom[1]
):
y_axis_bbox = subjects[left_or_right[0]]['bbox']
x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
if (
abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis
> abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
/ l_y_axis
):
sub_obj_map_h[left_or_right[0]].append(i)
else:
sub_obj_map_h[top_or_bottom[0]].append(i)
else:
if left_or_right[1] > top_or_bottom[1]:
sub_obj_map_h[top_or_bottom[0]].append(i)
else:
sub_obj_map_h[left_or_right[0]].append(i)
else:
if left_or_right[1] != float('inf'):
sub_obj_map_h[left_or_right[0]].append(i)
else:
sub_obj_map_h[top_or_bottom[0]].append(i)
ret = []
for i in sub_obj_map_h.keys():
ret.append(
{
'sub_bbox': {
'bbox': subjects[i]['bbox'],
'score': subjects[i]['score'],
},
'obj_bboxes': [
{'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
for j in sub_obj_map_h[i]
],
'sub_idx': i,
}
)
return ret
def __tie_up_category_by_distance_v3(
self,
page_no: int,
subject_category_id: int,
object_category_id: int,
priority_pos: PosRelationEnum,
):
subjects = self.__reduct_overlap(
list(
map(
lambda x: {'bbox': x['bbox'], 'score': x['score']},
filter(
lambda x: x['category_id'] == subject_category_id,
self.__model_list[page_no]['layout_dets'],
),
)
)
)
objects = self.__reduct_overlap(
list(
map(
lambda x: {'bbox': x['bbox'], 'score': x['score']},
filter(
lambda x: x['category_id'] == object_category_id,
self.__model_list[page_no]['layout_dets'],
),
)
)
)
ret = []
N, M = len(subjects), len(objects)
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
OBJ_IDX_OFFSET = 10000
SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1
all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET , OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)]
seen_idx = set()
seen_sub_idx = set()
while N > len(seen_sub_idx):
candidates = []
for idx, kind, x0, y0 in all_boxes_with_idx:
if idx in seen_idx:
continue
candidates.append((idx, kind, x0, y0))
if len(candidates) == 0:
break
left_x = min([v[2] for v in candidates])
top_y = min([v[3] for v in candidates])
candidates.sort(key=lambda x: (x[2]-left_x) ** 2 + (x[3] - top_y) ** 2)
fst_idx, fst_kind, left_x, top_y = candidates[0]
candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y)**2)
nxt = None
for i in range(1, len(candidates)):
if candidates[i][1] ^ fst_kind == 1:
nxt = candidates[i]
break
if nxt is None:
break
if fst_kind == SUB_BIT_KIND:
sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET
else:
sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET
pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
nearest_dis = float('inf')
for i in range(N):
if i in seen_idx or i == sub_idx:continue
nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
if pair_dis >= 3*nearest_dis:
seen_idx.add(sub_idx)
continue
seen_idx.add(sub_idx)
seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
seen_sub_idx.add(sub_idx)
ret.append(
{
'sub_bbox': {
'bbox': subjects[sub_idx]['bbox'],
'score': subjects[sub_idx]['score'],
},
'obj_bboxes': [
{'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']}
],
'sub_idx': sub_idx,
}
)
for i in range(len(objects)):
j = i + OBJ_IDX_OFFSET
if j in seen_idx:
continue
seen_idx.add(j)
nearest_dis, nearest_sub_idx = float('inf'), -1
for k in range(len(subjects)):
dis = bbox_distance(objects[i]['bbox'], subjects[k]['bbox'])
if dis < nearest_dis:
nearest_dis = dis
nearest_sub_idx = k
for k in range(len(subjects)):
if k != nearest_sub_idx: continue
if k in seen_sub_idx:
for kk in range(len(ret)):
if ret[kk]['sub_idx'] == k:
ret[kk]['obj_bboxes'].append({'score': objects[i]['score'], 'bbox': objects[i]['bbox']})
break
else:
ret.append(
{
'sub_bbox': {
'bbox': subjects[k]['bbox'],
'score': subjects[k]['score'],
},
'obj_bboxes': [
{'score': objects[i]['score'], 'bbox': objects[i]['bbox']}
],
'sub_idx': k,
}
)
seen_sub_idx.add(k)
seen_idx.add(k)
for i in range(len(subjects)):
if i in seen_sub_idx:
continue
ret.append(
{
'sub_bbox': {
'bbox': subjects[i]['bbox'],
'score': subjects[i]['score'],
},
'obj_bboxes': [],
'sub_idx': i,
}
)
return ret
def get_imgs_v2(self, page_no: int):
with_captions = self.__tie_up_category_by_distance_v3(
page_no, 3, 4, PosRelationEnum.BOTTOM
)
with_footnotes = self.__tie_up_category_by_distance_v3(
page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
)
ret = []
for v in with_captions:
record = {
'image_body': v['sub_bbox'],
'image_caption_list': v['obj_bboxes'],
}
filter_idx = v['sub_idx']
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
record['image_footnote_list'] = d['obj_bboxes']
ret.append(record)
return ret
def get_tables_v2(self, page_no: int) -> list:
with_captions = self.__tie_up_category_by_distance_v3(
page_no, 5, 6, PosRelationEnum.UP
)
with_footnotes = self.__tie_up_category_by_distance_v3(
page_no, 5, 7, PosRelationEnum.ALL
)
ret = []
for v in with_captions:
record = {
'table_body': v['sub_bbox'],
'table_caption_list': v['obj_bboxes'],
}
filter_idx = v['sub_idx']
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
record['table_footnote_list'] = d['obj_bboxes']
ret.append(record)
return ret
def get_imgs(self, page_no: int):
return self.get_imgs_v2(page_no)
def get_tables(
self, page_no: int
) -> list:  # three bboxes: caption, table body, table footnote
return self.get_tables_v2(page_no)
def get_equations(self, page_no: int) -> list:  # has both coordinates and text
inline_equations = self.__get_blocks_by_type(
ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
)
interline_equations = self.__get_blocks_by_type(
ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
)
interline_equations_blocks = self.__get_blocks_by_type(
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
)
return inline_equations, interline_equations, interline_equations_blocks
def get_discarded(self, page_no: int) -> list:  # in-house model, coordinates only
blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no)
return blocks
def get_text_blocks(self, page_no: int) -> list:  # in-house model, coordinates only, no text
blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no)
return blocks
def get_title_blocks(self, page_no: int) -> list:  # in-house model, coordinates only, no text
blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no)
return blocks
def get_ocr_text(self, page_no: int) -> list:  # from paddle, has both text and coordinates
text_spans = []
model_page_info = self.__model_list[page_no]
layout_dets = model_page_info['layout_dets']
for layout_det in layout_dets:
if layout_det['category_id'] == 15:
span = {
'bbox': layout_det['bbox'],
'content': layout_det['text'],
}
text_spans.append(span)
return text_spans
def get_all_spans(self, page_no: int) -> list:
def remove_duplicate_spans(spans):
new_spans = []
for span in spans:
if not any(span == existing_span for existing_span in new_spans):
new_spans.append(span)
return new_spans
all_spans = []
model_page_info = self.__model_list[page_no]
layout_dets = model_page_info['layout_dets']
allow_category_id_list = [3, 5, 13, 14, 15]
"""当成span拼接的"""
# 3: 'image', # 图片
# 5: 'table', # 表格
# 13: 'inline_equation', # 行内公式
# 14: 'interline_equation', # 行间公式
# 15: 'text', # ocr识别文本
for layout_det in layout_dets:
category_id = layout_det['category_id']
if category_id in allow_category_id_list:
span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
if category_id == 3:
span['type'] = ContentType.Image
elif category_id == 5:
# Get the table model result
latex = layout_det.get('latex', None)
html = layout_det.get('html', None)
if latex:
span['latex'] = latex
elif html:
span['html'] = html
span['type'] = ContentType.Table
elif category_id == 13:
span['content'] = layout_det['latex']
span['type'] = ContentType.InlineEquation
elif category_id == 14:
span['content'] = layout_det['latex']
span['type'] = ContentType.InterlineEquation
elif category_id == 15:
span['content'] = layout_det['text']
span['type'] = ContentType.Text
all_spans.append(span)
return remove_duplicate_spans(all_spans)
def get_page_size(self, page_no: int):  # get page width and height
# Get the page object for the current page
page = self.__docs.get_page(page_no).get_page_info()
# Get the width and height of the current page
page_w = page.w
page_h = page.h
return page_w, page_h
def __get_blocks_by_type(
self, type: int, page_no: int, extra_col: list[str] = []
) -> list:
blocks = []
for page_dict in self.__model_list:
layout_dets = page_dict.get('layout_dets', [])
page_info = page_dict.get('page_info', {})
page_number = page_info.get('page_no', -1)
if page_no != page_number:
continue
for item in layout_dets:
category_id = item.get('category_id', -1)
bbox = item.get('bbox', None)
if category_id == type:
block = {
'bbox': bbox,
'score': item.get('score'),
}
for col in extra_col:
block[col] = item.get(col, None)
blocks.append(block)
return blocks
def get_model_list(self, page_no):
return self.__model_list[page_no]
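# A minimal usage sketch (names are illustrative): `model_json` is the per-page list produced
# by doc_analyze and `dataset` is the Dataset it was built from.
#
#   magic_model = MagicModel(model_json, dataset)
#   images = magic_model.get_imgs(page_no=0)    # figure bodies with caption/footnote lists
#   tables = magic_model.get_tables(page_no=0)  # table bodies with caption/footnote lists
#   spans = magic_model.get_all_spans(page_no=0)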
import os
import torch
from loguru import logger
from .model_list import AtomicModel
from ...model.layout.doclayout_yolo import DocLayoutYOLOModel
from ...model.mfd.yolo_v8 import YOLOv8MFDModel
from ...model.mfr.unimernet.Unimernet import UnimernetModel
from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
from ...model.table.rapid_table import RapidTableModel
doclayout_yolo = "Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
yolo_v8_mfd = "MFD/YOLO/yolo_v8_ft.pt"
unimernet_small = "MFR/unimernet_hf_small_2503"
def table_model_init(lang=None):
atom_model_manager = AtomModelSingleton()
ocr_engine = atom_model_manager.get_atom_model(
atom_model_name='ocr',
det_db_box_thresh=0.5,
det_db_unclip_ratio=1.6,
lang=lang
)
table_model = RapidTableModel(ocr_engine)
return table_model
def mfd_model_init(weight, device='cpu'):
if str(device).startswith('npu'):
device = torch.device(device)
mfd_model = YOLOv8MFDModel(weight, device)
return mfd_model
def mfr_model_init(weight_dir, device='cpu'):
mfr_model = UnimernetModel(weight_dir, device)
return mfr_model
def doclayout_yolo_model_init(weight, device='cpu'):
if str(device).startswith('npu'):
device = torch.device(device)
model = DocLayoutYOLOModel(weight, device)
return model
def ocr_model_init(det_db_box_thresh=0.3,
lang=None,
use_dilation=True,
det_db_unclip_ratio=1.8,
):
if lang is not None and lang != '':
model = PytorchPaddleOCR(
det_db_box_thresh=det_db_box_thresh,
lang=lang,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
else:
model = PytorchPaddleOCR(
det_db_box_thresh=det_db_box_thresh,
use_dilation=use_dilation,
det_db_unclip_ratio=det_db_unclip_ratio,
)
return model
class AtomModelSingleton:
_instance = None
_models = {}
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def get_atom_model(self, atom_model_name: str, **kwargs):
lang = kwargs.get('lang', None)
table_model_name = kwargs.get('table_model_name', None)
if atom_model_name in [AtomicModel.OCR]:
key = (atom_model_name, lang)
elif atom_model_name in [AtomicModel.Table]:
key = (atom_model_name, table_model_name, lang)
else:
key = atom_model_name
if key not in self._models:
self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs)
return self._models[key]
def atom_model_init(model_name: str, **kwargs):
atom_model = None
if model_name == AtomicModel.Layout:
atom_model = doclayout_yolo_model_init(
kwargs.get('doclayout_yolo_weights'),
kwargs.get('device')
)
elif model_name == AtomicModel.MFD:
atom_model = mfd_model_init(
kwargs.get('mfd_weights'),
kwargs.get('device')
)
elif model_name == AtomicModel.MFR:
atom_model = mfr_model_init(
kwargs.get('mfr_weight_dir'),
kwargs.get('device')
)
elif model_name == AtomicModel.OCR:
atom_model = ocr_model_init(
kwargs.get('det_db_box_thresh'),
kwargs.get('lang'),
)
elif model_name == AtomicModel.Table:
atom_model = table_model_init(
kwargs.get('lang'),
)
else:
logger.error('model name not allowed')
exit(1)
if atom_model is None:
logger.error('model init failed')
exit(1)
else:
return atom_model
class MineruPipelineModel:
def __init__(self, **kwargs):
self.formula_config = kwargs.get('formula_config')
self.apply_formula = self.formula_config.get('enable', True)
self.table_config = kwargs.get('table_config')
self.apply_table = self.table_config.get('enable', True)
self.lang = kwargs.get('lang', None)
self.device = kwargs.get('device', 'cpu')
logger.info(
'DocAnalysis init, this may take some time...'
)
atom_model_manager = AtomModelSingleton()
models_dir = kwargs.get('models_dir', "")
if not models_dir:
logger.error("can't found models_dir, please set models_dir")
exit(1)
if self.apply_formula:
# Initialize the formula detection (MFD) model
self.mfd_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.MFD,
mfd_weights=str(
os.path.join(models_dir, yolo_v8_mfd)
),
device=self.device,
)
# Initialize the formula recognition (MFR) model
mfr_weight_dir = str(
os.path.join(models_dir, unimernet_small)
)
self.mfr_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.MFR,
mfr_weight_dir=mfr_weight_dir,
device=self.device,
)
# Initialize the layout model
self.layout_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Layout,
doclayout_yolo_weights=str(
os.path.join(models_dir, doclayout_yolo)
),
device=self.device,
)
# Initialize the OCR model
self.ocr_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.OCR,
det_db_box_thresh=0.3,
lang=self.lang
)
# init table model
if self.apply_table:
self.table_model = atom_model_manager.get_atom_model(
atom_model_name=AtomicModel.Table,
lang=self.lang,
)
logger.info('DocAnalysis init done!')
class AtomicModel:
Layout = "layout"
MFD = "mfd"
MFR = "mfr"
OCR = "ocr"
Table = "table"
import re
-from ...libs.cut_image import cut_image_and_table
+from mineru.utils.cut_image import cut_image_and_table
-from ...libs.enum_class import BlockType, ContentType
+from mineru.utils.enum_class import BlockType, ContentType
-from ...libs.hash_utils import str_md5
+from mineru.utils.hash_utils import str_md5
-from ...libs.magic_model import fix_two_layer_blocks
+from mineru.utils.magic_model import fix_two_layer_blocks
-from ...libs.version import __version__
+from mineru.version import __version__
def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict:
@@ -4,7 +4,7 @@ import time
from loguru import logger
from ...data.data_reader_writer import DataWriter
-from ...libs.pdf_image_tools import load_images_from_pdf
+from mineru.utils.pdf_image_tools import load_images_from_pdf
from .base_predictor import BasePredictor
from .predictor import get_predictor
from .token_to_middle_json import result_to_middle_json
# Copyright (c) Opendatalab. All rights reserved.
from doclayout_yolo import YOLOv10
from tqdm import tqdm
class DocLayoutYOLOModel(object):
def __init__(self, weight, device):
self.model = YOLOv10(weight)
self.device = device
def predict(self, image):
layout_res = []
doclayout_yolo_res = self.model.predict(
image,
imgsz=1280,
conf=0.10,
iou=0.45,
verbose=False, device=self.device
)[0]
for xyxy, conf, cla in zip(
doclayout_yolo_res.boxes.xyxy.cpu(),
doclayout_yolo_res.boxes.conf.cpu(),
doclayout_yolo_res.boxes.cls.cpu(),
):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 3),
}
layout_res.append(new_item)
return layout_res
def batch_predict(self, images: list, batch_size: int) -> list:
images_layout_res = []
# for index in range(0, len(images), batch_size):
for index in tqdm(range(0, len(images), batch_size), desc="Layout Predict"):
doclayout_yolo_res = [
image_res.cpu()
for image_res in self.model.predict(
images[index : index + batch_size],
imgsz=1280,
conf=0.10,
iou=0.45,
verbose=False,
device=self.device,
)
]
for image_res in doclayout_yolo_res:
layout_res = []
for xyxy, conf, cla in zip(
image_res.boxes.xyxy,
image_res.boxes.conf,
image_res.boxes.cls,
):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 3),
}
layout_res.append(new_item)
images_layout_res.append(layout_res)
return images_layout_res
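# A minimal standalone sketch (the weight path and image file are assumptions; in this
# pipeline the model is normally constructed through AtomModelSingleton / atom_model_init):
#
#   import cv2
#   model = DocLayoutYOLOModel("doclayout_yolo_docstructbench_imgsz1280_2501.pt", device="cpu")
#   page_img = cv2.imread("page_0.png")
#   layout_res = model.batch_predict([page_img], batch_size=1)[0]
#   for det in layout_res:
#       print(det["category_id"], det["score"], det["poly"])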
# Copyright (c) Opendatalab. All rights reserved.
from tqdm import tqdm
from ultralytics import YOLO
class YOLOv8MFDModel(object):
def __init__(self, weight, device="cpu"):
self.mfd_model = YOLO(weight)
self.device = device
def predict(self, image):
mfd_res = self.mfd_model.predict(
image, imgsz=1888, conf=0.25, iou=0.45, verbose=False, device=self.device
)[0]
return mfd_res
def batch_predict(self, images: list, batch_size: int) -> list:
images_mfd_res = []
# for index in range(0, len(images), batch_size):
for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"):
mfd_res = [
image_res.cpu()
for image_res in self.mfd_model.predict(
images[index : index + batch_size],
imgsz=1888,
conf=0.25,
iou=0.45,
verbose=False,
device=self.device,
)
]
for image_res in mfd_res:
images_mfd_res.append(image_res)
return images_mfd_res
# Copyright (c) Opendatalab. All rights reserved.
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
class MathDataset(Dataset):
def __init__(self, image_paths, transform=None):
self.image_paths = image_paths
self.transform = transform
def __len__(self):
return len(self.image_paths)
def __getitem__(self, idx):
raw_image = self.image_paths[idx]
if self.transform:
image = self.transform(raw_image)
return image
class UnimernetModel(object):
def __init__(self, weight_dir, _device_="cpu"):
from .unimernet_hf import UnimernetModel
if _device_.startswith("mps"):
self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
else:
self.model = UnimernetModel.from_pretrained(weight_dir)
self.device = _device_
self.model.to(_device_)
if not _device_.startswith("cpu"):
self.model = self.model.to(dtype=torch.float16)
self.model.eval()
def predict(self, mfd_res, image):
formula_list = []
mf_image_list = []
for xyxy, conf, cla in zip(
mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()
):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": 13 + int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 2),
"latex": "",
}
formula_list.append(new_item)
bbox_img = image[ymin:ymax, xmin:xmax]
mf_image_list.append(bbox_img)
dataset = MathDataset(mf_image_list, transform=self.model.transform)
dataloader = DataLoader(dataset, batch_size=32, num_workers=0)
mfr_res = []
for mf_img in dataloader:
mf_img = mf_img.to(dtype=self.model.dtype)
mf_img = mf_img.to(self.device)
with torch.no_grad():
output = self.model.generate({"image": mf_img})
mfr_res.extend(output["fixed_str"])
for res, latex in zip(formula_list, mfr_res):
res["latex"] = latex
return formula_list
def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
images_formula_list = []
mf_image_list = []
backfill_list = []
image_info = [] # Store (area, original_index, image) tuples
# Collect images with their original indices
for image_index in range(len(images_mfd_res)):
mfd_res = images_mfd_res[image_index]
np_array_image = images[image_index]
formula_list = []
for idx, (xyxy, conf, cla) in enumerate(zip(
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
)):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": 13 + int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 2),
"latex": "",
}
formula_list.append(new_item)
bbox_img = np_array_image[ymin:ymax, xmin:xmax]
area = (xmax - xmin) * (ymax - ymin)
curr_idx = len(mf_image_list)
image_info.append((area, curr_idx, bbox_img))
mf_image_list.append(bbox_img)
images_formula_list.append(formula_list)
backfill_list += formula_list
# Stable sort by area
image_info.sort(key=lambda x: x[0]) # sort by area
sorted_indices = [x[1] for x in image_info]
sorted_images = [x[2] for x in image_info]
# Create mapping for results
index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
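# Worked toy example of this bookkeeping: crops with areas [9, 1, 4] at original indices
# [0, 1, 2] are processed in ascending-area order [1, 2, 0], so index_mapping = {0: 1, 1: 2, 2: 0}.
# If the model then returns ["r_b", "r_c", "r_a"] for the sorted crops, the restore loop below
# writes them back as unsorted_results = ["r_a", "r_b", "r_c"], matching backfill_list's order.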
# Create dataset with sorted images
dataset = MathDataset(sorted_images, transform=self.model.transform)
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
# Process batches and store results
mfr_res = []
# for mf_img in dataloader:
with tqdm(total=len(sorted_images), desc="MFR Predict") as pbar:
for index, mf_img in enumerate(dataloader):
mf_img = mf_img.to(dtype=self.model.dtype)
mf_img = mf_img.to(self.device)
with torch.no_grad():
output = self.model.generate({"image": mf_img})
mfr_res.extend(output["fixed_str"])
# Advance the progress bar by batch_size; the last batch may be smaller than batch_size
current_batch_size = min(batch_size, len(sorted_images) - index * batch_size)
pbar.update(current_batch_size)
# Restore original order
unsorted_results = [""] * len(mfr_res)
for new_idx, latex in enumerate(mfr_res):
original_idx = index_mapping[new_idx]
unsorted_results[original_idx] = latex
# Fill results back
for res, latex in zip(backfill_list, unsorted_results):
res["latex"] = latex
return images_formula_list
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM
from .modeling_unimernet import UnimernetModel
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
"UnimernetModel",
]
import os
import re
import warnings
from typing import Optional
import torch
from ftfy import fix_text
from loguru import logger
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig)
AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig)
AutoModel.register(UnimerSwinConfig, UnimerSwinModel)
AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM)
# TODO: rewrite tokenizer
class TokenizerWrapper:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
self.pad_token_id = self.tokenizer.pad_token_id
self.bos_token_id = self.tokenizer.bos_token_id
self.eos_token_id = self.tokenizer.eos_token_id
def __len__(self):
return len(self.tokenizer)
def tokenize(self, text, **kwargs):
return self.tokenizer(
text,
return_token_type_ids=False,
return_tensors="pt",
padding="longest",
truncation=True,
**kwargs,
)
def token2str(self, tokens) -> list:
generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
generated_text = [fix_text(text) for text in generated_text]
return generated_text
def detokenize(self, tokens):
toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens]
for b in range(len(toks)):
for i in reversed(range(len(toks[b]))):
if toks[b][i] is None:
toks[b][i] = ''
toks[b][i] = toks[b][i].replace('Ġ', ' ').strip()
if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]):
del toks[b][i]
return toks
LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
def fix_latex_left_right(s):
"""
修复LaTeX中的\\left和\\right命令
1. 确保它们后面跟有效分隔符
2. 平衡\\left和\\right的数量
"""
# 白名单分隔符
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
r'\Uparrow', r'\Downarrow', r'\|', r'\.']
# Add a "." when \left/\right is missing a valid delimiter
def fix_delim(match, is_left=True):
cmd = match.group(1)  # \left or \right
rest = match.group(2) if len(match.groups()) > 1 else ""
if not rest or rest not in valid_delims_list:
return cmd + "."
return match.group(0)
# Match \left and \right commands with more precise patterns,
# making sure they are standalone commands rather than parts of other commands.
# Use pre-compiled regexes and a shared callback.
s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s)
s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s)
# Count \left and \right more precisely
left_count = len(LEFT_COUNT_PATTERN.findall(s))  # does not match \lefteqn etc.
right_count = len(RIGHT_COUNT_PATTERN.findall(s))  # does not match \rightarrow etc.
if left_count == right_count:
# If the counts match, check whether each pair is in the same group
return fix_left_right_pairs(s)
else:
# If the counts differ, remove all \left and \right
# logger.debug(f"latex:{s}")
# logger.warning(f"left_count: {left_count}, right_count: {right_count}")
return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
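# Illustrative behaviour, derived from the logic above:
#   fix_latex_left_right(r"\left( x \right)")  ->  r"\left( x \right)"  (balanced, unchanged)
#   fix_latex_left_right(r"\left( x")          ->  "( x"                (unbalanced, \left stripped)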
def fix_left_right_pairs(latex_formula):
"""
检测并修复LaTeX公式中\\left和\\right不在同一组的情况
Args:
latex_formula (str): 输入的LaTeX公式
Returns:
str: 修复后的LaTeX公式
"""
# 用于跟踪花括号嵌套层级
brace_stack = []
# 用于存储\left信息: (位置, 深度, 分隔符)
left_stack = []
# 存储需要调整的\right信息: (开始位置, 结束位置, 目标位置)
adjustments = []
i = 0
while i < len(latex_formula):
# Check whether this character is escaped
if i > 0 and latex_formula[i - 1] == '\\':
backslash_count = 0
j = i - 1
while j >= 0 and latex_formula[j] == '\\':
backslash_count += 1
j -= 1
if backslash_count % 2 == 1:
i += 1
continue
# Detect a \left command
if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula):
delimiter = latex_formula[i + 5]
left_stack.append((i, len(brace_stack), delimiter))
i += 6  # skip \left and its delimiter
continue
# Detect a \right command
elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula):
delimiter = latex_formula[i + 6]
if left_stack:
left_pos, left_depth, left_delim = left_stack.pop()
# If \left and \right are not at the same brace depth
if left_depth != len(brace_stack):
# Find the end of the brace group that contains \left
target_pos = find_group_end(latex_formula, left_pos, left_depth)
if target_pos != -1:
# Record the \right that needs to be moved
adjustments.append((i, i + 7, target_pos))
i += 7  # skip \right and its delimiter
continue
# Handle braces
if latex_formula[i] == '{':
brace_stack.append(i)
elif latex_formula[i] == '}':
if brace_stack:
brace_stack.pop()
i += 1
# Apply adjustments from back to front to avoid index shifts
if not adjustments:
return latex_formula
result = list(latex_formula)
adjustments.sort(reverse=True, key=lambda x: x[0])
for start, end, target in adjustments:
# Extract the \right part
right_part = result[start:end]
# Remove it from its original position
del result[start:end]
# Insert it at the target position
result.insert(target, ''.join(right_part))
return ''.join(result)
def find_group_end(text, pos, depth):
"""查找特定深度的花括号组的结束位置"""
current_depth = depth
i = pos
while i < len(text):
if text[i] == '{' and (i == 0 or not is_escaped(text, i)):
current_depth += 1
elif text[i] == '}' and (i == 0 or not is_escaped(text, i)):
current_depth -= 1
if current_depth < depth:
return i
i += 1
return -1  # no matching end position found
def is_escaped(text, pos):
"""检查字符是否被转义"""
backslash_count = 0
j = pos - 1
while j >= 0 and text[j] == '\\':
backslash_count += 1
j -= 1
return backslash_count % 2 == 1
def fix_unbalanced_braces(latex_formula):
"""
检测LaTeX公式中的花括号是否闭合,并删除无法配对的花括号
Args:
latex_formula (str): 输入的LaTeX公式
Returns:
str: 删除无法配对的花括号后的LaTeX公式
"""
stack = [] # 存储左括号的索引
unmatched = set() # 存储不匹配括号的索引
i = 0
while i < len(latex_formula):
# Check whether this brace is escaped
if latex_formula[i] in ['{', '}']:
# Count the consecutive backslashes before it
backslash_count = 0
j = i - 1
while j >= 0 and latex_formula[j] == '\\':
backslash_count += 1
j -= 1
# If preceded by an odd number of backslashes, the brace is escaped and does not take part in matching
if backslash_count % 2 == 1:
i += 1
continue
# Otherwise, the brace takes part in matching
if latex_formula[i] == '{':
stack.append(i)
else: # latex_formula[i] == '}'
if stack:  # there is a matching open brace
stack.pop()
else:  # no matching open brace
unmatched.add(i)
i += 1
# All unmatched open braces
unmatched.update(stack)
# Build a new string with the unmatched braces removed
return ''.join(char for i, char in enumerate(latex_formula) if i not in unmatched)
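# Illustrative behaviour, derived from the logic above:
#   fix_unbalanced_braces("{a} + b}")    ->  "{a} + b"      (stray closing brace removed)
#   fix_unbalanced_braces(r"\{a\} + b")  ->  r"\{a\} + b"   (escaped braces left untouched)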
def process_latex(input_string):
"""
处理LaTeX公式中的反斜杠:
1. 如果\后跟特殊字符(#$%&~_^\\{})或空格,保持不变
2. 如果\后跟两个小写字母,保持不变
3. 其他情况,在\后添加空格
Args:
input_string (str): 输入的LaTeX公式
Returns:
str: 处理后的LaTeX公式
"""
def replace_func(match):
# Get the character following the backslash
next_char = match.group(1)
# Keep special characters and whitespace unchanged
if next_char in "#$%&~_^|\\{} \t\n\r\v\f":
return match.group(0)
# If it is a letter, check the following character
if 'a' <= next_char <= 'z' or 'A' <= next_char <= 'Z':
pos = match.start() + 2  # position after \x
if pos < len(input_string) and ('a' <= input_string[pos] <= 'z' or 'A' <= input_string[pos] <= 'Z'):
# The next character is also a letter, keep unchanged
return match.group(0)
# Otherwise, insert a space after the backslash
return '\\' + ' ' + next_char
# Match a backslash followed by a single character
pattern = r'\\(.)'
return re.sub(pattern, replace_func, input_string)
# Common math environments available in KaTeX/MathJax
ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}
def fix_latex_environments(s):
"""
Check whether \\begin and \\end of LaTeX environments (e.g. array) are matched:
1. If a \\begin tag is missing, add it at the start
2. If an \\end tag is missing, add it at the end
"""
for env in ENV_TYPES:
begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
end_count = len(ENV_END_PATTERNS[env].findall(s))
if begin_count != end_count:
if end_count > begin_count:
format_match = ENV_FORMAT_PATTERNS[env].search(s)
default_format = '{c}' if env == 'array' else ''
format_str = '{' + format_match.group(1) + '}' if format_match else default_format
missing_count = end_count - begin_count
begin_command = '\\begin{' + env + '}' + format_str + ' '
s = begin_command * missing_count + s
else:
missing_count = begin_count - end_count
s = s + (' \\end{' + env + '}') * missing_count
return s
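# Illustrative behaviour, derived from the logic above:
#   fix_latex_environments(r"1 & 2 \end{matrix}")  ->  r"\begin{matrix} 1 & 2 \end{matrix}"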
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
COMMANDS_TO_REMOVE_PATTERN = re.compile(
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
REPLACEMENTS_PATTERNS = {
re.compile(r'\\underbar'): r'\\underline',
re.compile(r'\\Bar'): r'\\hat',
re.compile(r'\\Hat'): r'\\hat',
re.compile(r'\\Tilde'): r'\\tilde',
re.compile(r'\\slash'): r'/',
re.compile(r'\\textperthousand'): r'‰',
re.compile(r'\\sun'): r'☉',
re.compile(r'\\textunderscore'): r'\\_',
re.compile(r'\\fint'): r'⨏',
re.compile(r'\\up '): r'\\ ',
re.compile(r'\\vline = '): r'\\models ',
re.compile(r'\\vDash '): r'\\models ',
re.compile(r'\\sq \\sqcup '): r'\\square ',
}
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code."""
s = fix_unbalanced_braces(s)
s = fix_latex_left_right(s)
s = fix_latex_environments(s)
# Use the pre-compiled regular expressions
s = UP_PATTERN.sub(
lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s
)
s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)
# Apply all replacements
for pattern, replacement in REPLACEMENTS_PATTERNS.items():
s = pattern.sub(replacement, s)
# Handle backslashes and spaces in the LaTeX string
s = process_latex(s)
# Add a space after \qquad
s = QQUAD_PATTERN.sub(r'\\qquad ', s)
return s
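# Illustrative end-to-end behaviour, derived from the helpers above:
#   latex_rm_whitespace(r"\boldmath{x} + y}")  ->  "{x} + y"
#   (the stray "}" is dropped by fix_unbalanced_braces, then \boldmath is removed by
#    COMMANDS_TO_REMOVE_PATTERN)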
class UnimernetModel(VisionEncoderDecoderModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[PreTrainedModel] = None,
decoder: Optional[PreTrainedModel] = None,
):
# VisionEncoderDecoderModel's checking log has a bug; disable it temporarily.
base_model_logger.disabled = True
try:
super().__init__(config, encoder, decoder)
finally:
base_model_logger.disabled = False
if not config or not hasattr(config, "_name_or_path"):
raise RuntimeError("config._name_or_path is required by UnimernetModel.")
model_path = config._name_or_path
self.transform = UnimerSwinImageProcessor()
self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path))
self._post_check()
def _post_check(self):
tokenizer = self.tokenizer
if tokenizer.tokenizer.model_max_length != self.config.decoder.max_position_embeddings:
warnings.warn(
f"decoder.max_position_embeddings={self.config.decoder.max_position_embeddings}," +
f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set" +
f" tokenizer.model_max_length to {self.config.decoder.max_position_embeddings}.")
tokenizer.tokenizer.model_max_length = self.config.decoder.max_position_embeddings
assert self.config.decoder.vocab_size == len(tokenizer)
assert self.config.decoder_start_token_id == tokenizer.bos_token_id
assert self.config.pad_token_id == tokenizer.pad_token_id
@classmethod
def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."):
config = VisionEncoderDecoderConfig.from_pretrained(model_path)
config._name_or_path = model_path
config.encoder = UnimerSwinConfig(**vars(config.encoder))
config.decoder = UnimerMBartConfig(**vars(config.decoder))
encoder = UnimerSwinModel(config.encoder)
decoder = UnimerMBartForCausalLM(config.decoder)
model = cls(config, encoder, decoder)
# load model weights
model_file_path = os.path.join(model_path, model_filename)
checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True)
state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
if not state_dict:
raise RuntimeError("state_dict is empty.")
if state_dict_strip_prefix:
state_dict = {
k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v
for k, v in state_dict.items()
}
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
if len(unexpected_keys) > 0:
warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys)))
if len(missing_keys) > 0:
raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys)))
return model
def forward_bak(self, samples):
pixel_values, text = samples["image"], samples["text_input"]
text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device)
decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"]
num_channels = pixel_values.shape[1]
if num_channels == 1:
pixel_values = pixel_values.repeat(1, 3, 1, 1)
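        # Teacher forcing: labels are a copy of the decoder input ids (via * 1), with pad positions
        # masked to -100 so the loss ignores them; inputs and labels are offset by one position below.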
labels = decoder_input_ids * 1
labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)
loss = self.model(
pixel_values=pixel_values,
decoder_input_ids=decoder_input_ids[:, :-1],
decoder_attention_mask=decoder_attention_mask[:, :-1],
labels=labels[:, 1:],
).loss
return {"loss": loss}
def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95):
pixel_values = samples["image"]
num_channels = pixel_values.shape[1]
if num_channels == 1:
pixel_values = pixel_values.repeat(1, 3, 1, 1)
kwargs = {}
if do_sample:
kwargs["temperature"] = temperature
kwargs["top_p"] = top_p
outputs = super().generate(
pixel_values=pixel_values,
max_new_tokens=self.tokenizer.tokenizer.model_max_length, # required
decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id,
do_sample=do_sample,
**kwargs,
)
outputs = outputs[:, 1:].cpu().numpy()
pred_tokens = self.tokenizer.detokenize(outputs)
pred_str = self.tokenizer.token2str(outputs)
fixed_str = [latex_rm_whitespace(s) for s in pred_str]
return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str}
from .configuration_unimer_mbart import UnimerMBartConfig
from .modeling_unimer_mbart import UnimerMBartModel, UnimerMBartForCausalLM
__all__ = [
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
]
# coding=utf-8
# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""UnimerMBART model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class UnimerMBartConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`UnimerMBartModel`]. It is used to instantiate a
    UnimerMBart model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to that of the MBART
    [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the UnimerMBart model. Defines the number of different tokens that can be represented
            by the `input_ids` passed when calling [`UnimerMBartModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
qk_squeeze (`int`, *optional*, defaults to 2):
Squeeze ratio for query/key's output dimension. See the [UniMERNet paper](https://arxiv.org/abs/2404.15254).
Squeeze Attention maps the query and key to a lower-dimensional space without excessive loss of information,
thereby accelerating the computation of attention.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
Example:
```python
>>> from transformers import MBartConfig, MBartModel
>>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
>>> configuration = MBartConfig()
>>> # Initializing a model (with random weights) from the facebook/mbart-large-cc25 style configuration
>>> model = MBartModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "unimer-mbart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50265,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=1024,
qk_squeeze=2,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
forced_eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.qk_squeeze = qk_squeeze
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
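# Illustrative usage (a sketch; the argument values are hypothetical, not the shipped
# checkpoint's settings):
#   config = UnimerMBartConfig(d_model=1024, decoder_layers=8, qk_squeeze=2)
# With qk_squeeze=2, the squeeze attention described above presumably projects queries and
# keys to roughly d_model / qk_squeeze dimensions before the attention scores are computed,
# which is where the speed-up over standard MBart attention comes from.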