Commit 4a823359 authored by quyuan's avatar quyuan
Browse files

Merge branch 'master' of https://github.com/opendatalab/MinerU

parents 611e2f59 b6df9b18
......@@ -3,9 +3,14 @@
[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
[English](README.md) | [简体中文](README_zh-CN.md)
......@@ -63,9 +68,6 @@ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
- An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
## Getting Started
......@@ -205,4 +207,4 @@ The project currently leverages PyMuPDF to deliver advanced functionalities; how
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
</picture>
</a>
\ No newline at end of file
</a>
......@@ -3,9 +3,11 @@
[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
[![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
[English](README.md) | [简体中文](README_zh-CN.md)
......@@ -61,9 +63,6 @@ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
- 高质量的PDF内容提取工具包
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
- 端到端的PDF文档理解评估套件,专为大规模模型数据场景而设计
## 上手指南
......
import os
import json
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
current_script_dir = os.path.dirname(os.path.abspath(__file__))
demo_name = "demo1"
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
model_path = os.path.join(current_script_dir, f"{demo_name}.json")
pdf_bytes = open(pdf_path, "rb").read()
model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
local_image_dir = os.path.join(current_script_dir, 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = DiskReaderWriter(local_image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
f.write(md_content)
try:
current_script_dir = os.path.dirname(os.path.abspath(__file__))
demo_name = "demo1"
pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf")
model_path = os.path.join(current_script_dir, f"{demo_name}.json")
pdf_bytes = open(pdf_path, "rb").read()
model_json = json.loads(open(model_path, "r", encoding="utf-8").read())
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
local_image_dir = os.path.join(current_script_dir, 'images')
image_dir = str(os.path.basename(local_image_dir))
image_writer = DiskReaderWriter(local_image_dir)
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
pipe.pipe_classify()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
with open(f"{demo_name}.md", "w", encoding="utf-8") as f:
f.write(md_content)
except Exception as e:
logger.exception(e)
\ No newline at end of file
#### Install Git LFS
Before you begin, make sure Git Large File Storage (Git LFS) is installed on your system. Install it using the following command:
```bash
git lfs install
```
#### Download the Model from Hugging Face
To download the `PDF-Extract-Kit` model from Hugging Face, use the following command:
```bash
git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit
```
Ensure that Git LFS is enabled during the clone to properly download all large files.
Put the downloaded model files here:
```
./
├── Layout
│ ├── config.json
│ └── model_final.pth
├── MFD
│ └── weights.pt
├── MFR
│ └── UniMERNet
│ ├── config.json
│ ├── preprocessor_config.json
│ ├── pytorch_model.bin
│ ├── README.md
│ ├── tokenizer_config.json
│ └── tokenizer.json
└── README.md
```
\ No newline at end of file
docs/images/flowchart_en.png

110 KB | W: | H:

docs/images/flowchart_en.png

105 KB | W: | H:

docs/images/flowchart_en.png
docs/images/flowchart_en.png
docs/images/flowchart_en.png
docs/images/flowchart_en.png
  • 2-up
  • Swipe
  • Onion skin
docs/images/flowchart_zh_cn.png

113 KB | W: | H:

docs/images/flowchart_zh_cn.png

106 KB | W: | H:

docs/images/flowchart_zh_cn.png
docs/images/flowchart_zh_cn.png
docs/images/flowchart_zh_cn.png
docs/images/flowchart_zh_cn.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -3,5 +3,7 @@
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"temp-output-dir":"/tmp"
"temp-output-dir":"/tmp",
"models-dir":"/tmp/models",
"device-mode":"cpu"
}
\ No newline at end of file
......@@ -28,18 +28,20 @@ from loguru import logger
from pathlib import Path
from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (
parse_s3path,
parse_s3_range_params,
remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import get_local_dir
from magic_pdf.libs.config_reader import (
get_local_dir,
get_s3_config,
)
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
......@@ -81,10 +83,12 @@ def do_parse(
f_dump_model_json=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
):
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
logger.info(f"local output dir is {local_md_dir}")
image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
image_dir = str(os.path.basename(local_image_dir))
......@@ -105,6 +109,7 @@ def do_parse(
if len(model_list) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze()
orig_model_list = copy.deepcopy(pipe.model_list)
else:
logger.error("need model list input")
exit(1)
......@@ -116,7 +121,7 @@ def do_parse(
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
if f_dump_md:
"""写markdown"""
md_writer.write(
......@@ -175,8 +180,10 @@ def cli():
default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def json_command(json, method, inside_model):
@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def json_command(json, method, inside_model, model_mode):
model_config.__use_inside_model__ = inside_model
model_config.__model_mode__ = model_mode
if not json.startswith("s3://"):
logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
......@@ -226,8 +233,10 @@ def json_command(json, method, inside_model):
default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def local_json_command(local_json, method, inside_model):
@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def local_json_command(local_json, method, inside_model, model_mode):
model_config.__use_inside_model__ = inside_model
model_config.__model_mode__ = model_mode
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
......@@ -278,8 +287,10 @@ def local_json_command(local_json, method, inside_model):
default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
def pdf_command(pdf, model, method, inside_model):
@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def pdf_command(pdf, model, method, inside_model, model_mode):
model_config.__use_inside_model__ = inside_model
model_config.__model_mode__ = model_mode
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
......@@ -290,7 +301,11 @@ def pdf_command(pdf, model, method, inside_model):
def get_model_json(model_path):
# 这里处理pdf和模型相关的逻辑
if model_path is None:
model_path = pdf.replace(".pdf", ".json")
file_name_without_extension, extension = os.path.splitext(pdf)
if extension == ".pdf":
model_path = file_name_without_extension + ".json"
else:
raise Exception("pdf_path input error")
if not os.path.exists(model_path):
logger.warning(
f"not found json {model_path} existed"
......
......@@ -59,5 +59,15 @@ def get_local_dir():
return config.get("temp-output-dir", "/tmp")
def get_local_models_dir():
    """Return the directory holding model weights, per config key "models-dir" (default "/tmp/models")."""
    return read_config().get("models-dir", "/tmp/models")
def get_device():
    """Return the configured inference device, per config key "device-mode" (default "cpu")."""
    return read_config().get("device-mode", "cpu")
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
import unicodedata
from fast_langdetect import detect_langs
from fast_langdetect import detect_language
def detect_lang(text: str) -> str:
if len(text) == 0:
return ""
try:
lang_upper = detect_langs(text)
lang_upper = detect_language(text)
except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_langs(html_no_ctrl_chars)
lang_upper = detect_language(html_no_ctrl_chars)
try:
lang = lang_upper.lower()
except:
......
__version__ = "0.5.12"
__version__ = "0.5.13"
from ultralytics import YOLO

# Fill these two paths in before running the demo.
image_path = ''  # image to run prediction on
model_path = ''  # trained YOLO weight file

detector = YOLO(model_path)
# Run detection, save the annotated image, and keep boxes above 0.5 confidence.
prediction = detector(image_path, save=True, conf=0.5, save_crop=False, line_width=2)
print(prediction)
\ No newline at end of file
__use_inside_model__ = False
__use_inside_model__ = True
__model_mode__ = "full"
import time
import fitz
import numpy as np
from loguru import logger
from magic_pdf.libs.config_reader import get_local_models_dir, get_device
from magic_pdf.model.model_list import MODEL
import magic_pdf.model as model_config
......@@ -21,10 +25,11 @@ def remove_duplicates_dicts(lst):
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
try:
import cv2
from PIL import Image
except ImportError:
logger.error("opencv-python and Pillow are not installed, please install by pip.")
logger.error("Pillow not installed, please install by pip.")
exit(1)
images = []
with fitz.open("pdf", pdf_bytes) as doc:
for index in range(0, doc.page_count):
......@@ -32,32 +37,49 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)
# if width or height > 2000 pixels, don't enlarge the image
# if pm.width > 2000 or pm.height > 2000:
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
# if width or height > 3000 pixels, don't enlarge the image
if pm.width > 3000 or pm.height > 3000:
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
img = np.array(img)
img_dict = {"img": img, "width": pm.width, "height": pm.height}
images.append(img_dict)
return images
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
model = None
if model_config.__model_mode__ == "lite":
model = MODEL.Paddle
elif model_config.__model_mode__ == "full":
model = MODEL.PEK
if model_config.__use_inside_model__:
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
model_init_start = time.time()
if model == MODEL.Paddle:
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
elif model == MODEL.PEK:
from magic_pdf.model.pdf_extract_kit import CustomPEKModel
# 从配置文件读取model-dir和device
local_models_dir = get_local_models_dir()
device = get_device()
custom_model = CustomPEKModel(ocr=ocr, show_log=show_log, models_dir=local_models_dir, device=device)
else:
logger.error("Not allow model_name!")
exit(1)
model_init_cost = time.time() - model_init_start
logger.info(f"model init cost: {model_init_cost}")
else:
logger.error("use_inside_model is False, not allow to use inside model")
exit(1)
images = load_images_from_pdf(pdf_bytes)
custom_model = None
if model == MODEL.Paddle:
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
else:
pass
model_json = []
doc_analyze_start = time.time()
for index, img_dict in enumerate(images):
img = img_dict["img"]
page_width = img_dict["width"]
......@@ -65,7 +87,8 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, mod
result = custom_model(img)
page_info = {"page_no": index, "height": page_height, "width": page_width}
page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict)
doc_analyze_cost = time.time() - doc_analyze_start
logger.info(f"doc analyze cost: {doc_analyze_cost}")
return model_json
class MODEL:
    # Registry of supported document-analysis backends; the values are the
    # string identifiers used when selecting a parsing model.
    Paddle = "pp_structure_v2"  # PP-StructureV2 (PaddleOCR) backend
    PEK = "pdf_extract_kit"  # PDF-Extract-Kit backend
import os
import time
import cv2
import numpy as np
import yaml
from PIL import Image
from ultralytics import YOLO
from loguru import logger
from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
from unimernet.common.config import Config
import unimernet.tasks as tasks
from unimernet.processors import load_processor
import argparse
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
def layout_model_init(weight, config_file, device):
    """Build and return a LayoutLMv3 layout-detection predictor.

    Thin wrapper around `Layoutlmv3_Predictor` taking the weight path,
    its inference config file, and the target device.
    """
    return Layoutlmv3_Predictor(weight, config_file, device)
def mfr_model_init(weight_dir, cfg_path, device='cpu'):
    """Initialise the UniMERNet math-formula-recognition (MFR) model.

    Loads the task config from `cfg_path`, points it at the local weight
    directory, builds the model on `device`, and returns
    (model, vis_processor) where vis_processor prepares formula crop
    images for inference.
    """
    cfg = Config(argparse.Namespace(cfg_path=cfg_path, options=None))
    # Redirect the config's pretrained/model/tokenizer paths to the local
    # weight directory before the model is built.
    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
    cfg.config.model.model_config.model_name = weight_dir
    cfg.config.model.tokenizer_config.path = weight_dir
    mfr_model = tasks.setup_task(cfg).build_model(cfg).to(device)
    vis_processor = load_processor(
        'formula_image_eval',
        cfg.config.datasets.formula_rec_eval.vis_processor.eval,
    )
    return mfr_model, vis_processor
class MathDataset(Dataset):
    """Map-style torch dataset yielding formula crop images for batched MFR inference.

    Items may be file-system paths (str), opened lazily in ``__getitem__``,
    or already-loaded image objects (presumably PIL crops — confirm callers).
    """

    def __init__(self, image_paths, transform=None):
        # image_paths: list of str paths or in-memory images
        # transform: optional callable applied to each loaded image
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        item = self.image_paths[idx]
        # A str entry is a path on disk; anything else is used as-is.
        raw_image = Image.open(item) if isinstance(item, str) else item
        if self.transform:
            return self.transform(raw_image)
        # Bug fix: the original returned an unbound local (UnboundLocalError)
        # whenever no transform was supplied; return the raw image instead.
        return raw_image
class CustomPEKModel:
    """PDF-Extract-Kit document analysis model.

    Bundles a LayoutLMv3 layout detector, an optional YOLO formula detector
    plus UniMERNet formula recognizer, and an optional PaddleOCR text
    recognizer. Calling the instance on a page image returns a list of
    layout detection dicts augmented with formula latex and OCR text.
    """

    def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
        """
        ======== model init ========

        :param ocr: whether to run OCR on text regions during __call__
        :param show_log: passed through to the PaddleOCR model
        :param kwargs: optional overrides — apply_layout, apply_formula,
            device, models_dir (defaults come from model_configs.yaml)
        """
        # Absolute path of this file (pdf_extract_kit.py)
        current_file_path = os.path.abspath(__file__)
        # Directory containing this file (model)
        current_dir = os.path.dirname(current_file_path)
        # Parent directory (magic_pdf)
        root_dir = os.path.dirname(current_dir)
        # model_config directory
        model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
        # Full path of the model_configs.yaml file
        config_path = os.path.join(model_config_dir, 'model_configs.yaml')
        with open(config_path, "r") as f:
            self.configs = yaml.load(f, Loader=yaml.FullLoader)
        # Parsing configuration: explicit kwargs override the YAML defaults.
        self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
        self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
        self.apply_ocr = ocr
        logger.info(
            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
                self.apply_layout, self.apply_formula, self.apply_ocr
            )
        )
        assert self.apply_layout, "DocAnalysis must contain layout model."
        # Select the compute device and the directory holding model weights.
        self.device = kwargs.get("device", self.configs["config"]["device"])
        logger.info("using device: {}".format(self.device))
        models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
        # Initialise the layout model
        self.layout_model = layout_model_init(
            os.path.join(models_dir, self.configs['weights']['layout']),
            os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml"),
            device=self.device
        )
        # Initialise formula handling
        if self.apply_formula:
            # Formula detection model (YOLO)
            self.mfd_model = YOLO(model=str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
            # Formula recognition model (UniMERNet)
            mfr_config_path = os.path.join(model_config_dir, 'UniMERNet', 'demo.yaml')
            self.mfr_model, mfr_vis_processors = mfr_model_init(
                os.path.join(models_dir, self.configs["weights"]["mfr"]),
                mfr_config_path,
                device=self.device
            )
            self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
        # Initialise OCR
        if self.apply_ocr:
            self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
        logger.info('DocAnalysis init done!')

    def __call__(self, image):
        """Analyse one page image; returns the list of layout-detection dicts.

        NOTE(review): assumes `image` is an RGB numpy array (it is passed to
        `Image.fromarray`) — confirm against the caller in doc_analyze.
        """
        latex_filling_list = []
        mf_image_list = []
        # Layout detection
        layout_start = time.time()
        layout_res = self.layout_model(image, ignore_catids=[])
        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f"layout detection cost: {layout_cost}")
        # Formula detection: detected formulas are appended to layout_res
        # with category ids offset by 13; latex is filled in below.
        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
            new_item = {
                'category_id': 13 + int(cla.item()),
                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                'score': round(float(conf.item()), 2),
                'latex': '',
            }
            layout_res.append(new_item)
            latex_filling_list.append(new_item)
            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
            mf_image_list.append(bbox_img)
        # Formula recognition: batch the formula crops and write the
        # recognised latex back into the dicts collected above.
        mfr_start = time.time()
        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
        mfr_res = []
        for mf_img in dataloader:
            mf_img = mf_img.to(self.device)
            output = self.mfr_model.generate({'image': mf_img})
            mfr_res.extend(output['pred_str'])
        for res, latex in zip(latex_filling_list, mfr_res):
            res['latex'] = latex_rm_whitespace(latex)
        mfr_cost = round(time.time() - mfr_start, 2)
        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
        # OCR
        if self.apply_ocr:
            ocr_start = time.time()
            pil_img = Image.fromarray(image)
            # Collect formula boxes (category ids 13/14) so the OCR model can
            # skip them inside text regions.
            single_page_mfdetrec_res = []
            for res in layout_res:
                if int(res['category_id']) in [13, 14]:
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    single_page_mfdetrec_res.append({
                        "bbox": [xmin, ymin, xmax, ymax],
                    })
            for res in layout_res:
                if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # categories that require OCR
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    crop_box = (xmin, ymin, xmax, ymax)
                    # Paste the crop onto a white page-sized canvas so the
                    # OCR coordinates stay in page space.
                    cropped_img = Image.new('RGB', pil_img.size, 'white')
                    cropped_img.paste(pil_img.crop(crop_box), crop_box)
                    cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
                    ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
                    if ocr_res:
                        # Each OCR hit is appended as a text span (category 15).
                        for box_ocr_res in ocr_res:
                            p1, p2, p3, p4 = box_ocr_res[0]
                            text, score = box_ocr_res[1]
                            layout_res.append({
                                'category_id': 15,
                                'poly': p1 + p2 + p3 + p4,
                                'score': round(score, 2),
                                'text': text,
                            })
            ocr_cost = round(time.time() - ocr_start, 2)
            logger.info(f"ocr cost: {ocr_cost}")
        return layout_res
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment