Unverified Commit 41d96cd8 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2065 from opendatalab/release-1.3.0

Release 1.3.0
parents c3d43e52 dd96663c
import os
import math
from pathlib import Path
import numpy as np
import cv2
import argparse
root_dir = Path(__file__).resolve().parent.parent.parent
DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml"
def init_args():
    """Build the command-line argument parser shared by the OCR pipeline.

    Returns:
        argparse.ArgumentParser: a parser pre-loaded with every detection,
        recognition, classification, e2e, SR and runtime option. Callers
        either call ``parse_args()`` on it directly or override defaults
        via ``parser.set_defaults(...)`` first.
    """
    def str2bool(v):
        # argparse's plain type=bool treats ANY non-empty string as True,
        # so boolean flags go through this helper instead.
        return v.lower() in ("true", "t", "1")

    parser = argparse.ArgumentParser()
    # params for prediction engine
    parser.add_argument("--use_gpu", type=str2bool, default=False)
    parser.add_argument("--det", type=str2bool, default=True)
    parser.add_argument("--rec", type=str2bool, default=True)
    parser.add_argument("--device", type=str, default='cpu')
    # parser.add_argument("--ir_optim", type=str2bool, default=True)
    # parser.add_argument("--use_tensorrt", type=str2bool, default=False)
    # parser.add_argument("--use_fp16", type=str2bool, default=False)
    parser.add_argument("--gpu_mem", type=int, default=500)
    parser.add_argument("--warmup", type=str2bool, default=False)
    # params for text detector
    parser.add_argument("--image_dir", type=str)
    parser.add_argument("--det_algorithm", type=str, default='DB')
    parser.add_argument("--det_model_path", type=str)
    parser.add_argument("--det_limit_side_len", type=float, default=960)
    parser.add_argument("--det_limit_type", type=str, default='max')
    # DB params
    parser.add_argument("--det_db_thresh", type=float, default=0.3)
    parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
    parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
    parser.add_argument("--max_batch_size", type=int, default=10)
    parser.add_argument("--use_dilation", type=str2bool, default=False)
    parser.add_argument("--det_db_score_mode", type=str, default="fast")
    # EAST params
    parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
    parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
    parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
    # SAST params
    parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
    parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
    parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
    # PSE params
    parser.add_argument("--det_pse_thresh", type=float, default=0)
    parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
    parser.add_argument("--det_pse_min_area", type=float, default=16)
    parser.add_argument("--det_pse_box_type", type=str, default='box')
    parser.add_argument("--det_pse_scale", type=int, default=1)
    # FCE params
    # NOTE(review): type=list on the next two args splits a CLI string into
    # individual characters (argparse calls list("8,16,32")); only the
    # defaults are usable. Kept as-is for interface compatibility — confirm
    # whether these are ever set from the command line before changing.
    parser.add_argument("--scales", type=list, default=[8, 16, 32])
    parser.add_argument("--alpha", type=float, default=1.0)
    parser.add_argument("--beta", type=float, default=1.0)
    parser.add_argument("--fourier_degree", type=int, default=5)
    parser.add_argument("--det_fce_box_type", type=str, default='poly')
    # params for text recognizer
    parser.add_argument("--rec_algorithm", type=str, default='CRNN')
    parser.add_argument("--rec_model_path", type=str)
    parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
    parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
    parser.add_argument("--rec_char_type", type=str, default='ch')
    parser.add_argument("--rec_batch_num", type=int, default=6)
    parser.add_argument("--max_text_length", type=int, default=25)
    parser.add_argument("--use_space_char", type=str2bool, default=True)
    parser.add_argument("--drop_score", type=float, default=0.5)
    parser.add_argument("--limited_max_width", type=int, default=1280)
    parser.add_argument("--limited_min_width", type=int, default=16)
    parser.add_argument(
        "--vis_font_path", type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf'))
    parser.add_argument(
        "--rec_char_dict_path",
        type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
                             'pytorchocr/utils/ppocr_keys_v1.txt'))
    # params for text classifier
    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
    parser.add_argument("--cls_model_path", type=str)
    parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
    parser.add_argument("--label_list", type=list, default=['0', '180'])
    parser.add_argument("--cls_batch_num", type=int, default=6)
    parser.add_argument("--cls_thresh", type=float, default=0.9)
    parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
    parser.add_argument("--use_pdserving", type=str2bool, default=False)
    # params for e2e
    parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
    parser.add_argument("--e2e_model_path", type=str)
    parser.add_argument("--e2e_limit_side_len", type=float, default=768)
    parser.add_argument("--e2e_limit_type", type=str, default='max')
    # PGNet params
    parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
    parser.add_argument(
        "--e2e_char_dict_path", type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
                             'pytorchocr/utils/ic15_dict.txt'))
    parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
    # Fixed: was type=bool, which made "--e2e_pgnet_polygon false" evaluate
    # to True (any non-empty string is truthy). str2bool matches the other
    # boolean flags such as --det_sast_polygon.
    parser.add_argument("--e2e_pgnet_polygon", type=str2bool, default=True)
    parser.add_argument("--e2e_pgnet_mode", type=str, default='fast')
    # SR params
    parser.add_argument("--sr_model_path", type=str)
    parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
    parser.add_argument("--sr_batch_num", type=int, default=1)
    # params .yaml
    parser.add_argument("--det_yaml_path", type=str, default=None)
    parser.add_argument("--rec_yaml_path", type=str, default=None)
    parser.add_argument("--cls_yaml_path", type=str, default=None)
    parser.add_argument("--e2e_yaml_path", type=str, default=None)
    parser.add_argument("--sr_yaml_path", type=str, default=None)
    # multi-process
    parser.add_argument("--use_mp", type=str2bool, default=False)
    parser.add_argument("--total_process_num", type=int, default=1)
    parser.add_argument("--process_id", type=int, default=0)
    parser.add_argument("--benchmark", type=str2bool, default=False)
    parser.add_argument("--save_log_path", type=str, default="./log_output/")
    parser.add_argument("--show_log", type=str2bool, default=True)
    return parser
def parse_args():
    """Parse command-line arguments using the shared parser from init_args()."""
    return init_args().parse_args()
def get_default_config(args):
    """Expose the parsed argparse namespace as a plain dict.

    Note: this is the namespace's own attribute dict (not a copy), so
    mutating the returned mapping mutates ``args`` as well.
    """
    return args.__dict__
def read_network_config_from_yaml(yaml_path, char_num=None):
    """Load the 'Architecture' section of a model yaml config.

    Args:
        yaml_path (str): path to the yaml file describing the network.
        char_num (int | None): size of the recognition character set; when
            given and the head is MultiHead, per-decoder output channel
            counts are derived from it.

    Returns:
        dict: the Architecture sub-config.

    Raises:
        FileNotFoundError: if *yaml_path* does not exist.
        ValueError: if the yaml has no 'Architecture' key.
    """
    if not os.path.exists(yaml_path):
        raise FileNotFoundError('{} is not existed.'.format(yaml_path))
    import yaml
    with open(yaml_path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    arch = cfg.get('Architecture')
    if arch is None:
        raise ValueError('{} has no Architecture'.format(yaml_path))
    if arch['Head']['name'] == 'MultiHead' and char_num is not None:
        # MultiHead decoders each reserve a few extra symbol slots
        # (SAR: +2, NRTR: +3) on top of the raw character count.
        arch['Head']['out_channels_list'] = {
            'CTCLabelDecode': char_num,
            'SARLabelDecode': char_num + 2,
            'NRTRLabelDecode': char_num + 3,
        }
    return arch
def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
    """Resolve the network architecture config for a weights file.

    Args:
        weights_path (str): path to the model weights; must exist.
        yaml_path (str | None): optional yaml describing the architecture.
        char_num (int | None): forwarded to the yaml reader.

    Returns:
        dict | None: the Architecture config when *yaml_path* is given,
        otherwise None (callers are expected to fall back to a built-in
        config in that case).

    Raises:
        FileNotFoundError: if *weights_path* does not exist.
    """
    if not os.path.exists(os.path.abspath(weights_path)):
        raise FileNotFoundError('{} is not found.'.format(weights_path))
    if yaml_path is None:
        return None
    return read_network_config_from_yaml(yaml_path, char_num=char_num)
def resize_img(img, input_size=600):
    """Scale *img* so that its longest side equals *input_size*.

    Args:
        img: image as anything np.array accepts (PIL image or ndarray).
        input_size (int): target length of the longest side in pixels.

    Returns:
        np.ndarray: the uniformly rescaled image.
    """
    img = np.array(img)
    longest_side = max(img.shape[0], img.shape[1])
    scale = float(input_size) / float(longest_side)
    return cv2.resize(img, None, None, fx=scale, fy=scale)
def str_count(s):
    """Return the display width of *s* measured in CJK character units.

    ASCII letters, digits and whitespace each count as half a CJK
    character; every other character (CJK, punctuation, ...) counts as
    one. The half-width total is rounded up, i.e. the result is
    ``len(s) - ceil(half_width_count / 2)``.

    Args:
        s (str): the input string.

    Returns:
        int: the width in CJK character units.
    """
    import string
    # Fixed: the original also maintained count_zh / count_pu counters that
    # were never used in the result — dead computation removed.
    half_width_count = sum(
        1 for c in s
        if c in string.ascii_letters or c.isdigit() or c.isspace()
    )
    return len(s) - math.ceil(half_width_count / 2)
def base64_to_cv2(b64str):
    """Decode a base64-encoded image string into a BGR cv2 image.

    Args:
        b64str (str): base64-encoded image bytes (e.g. PNG/JPEG payload).

    Returns:
        np.ndarray: the decoded image in BGR channel order, or None if
        cv2 cannot decode the payload.
    """
    import base64
    raw = base64.b64decode(b64str.encode('utf8'))
    # Fixed: np.fromstring is deprecated since NumPy 1.14 and removed in
    # NumPy 2.0; np.frombuffer is the supported equivalent for binary data.
    buf = np.frombuffer(raw, np.uint8)
    return cv2.imdecode(buf, cv2.IMREAD_COLOR)
def get_arch_config(model_path):
    """Look up the network architecture config for a model file.

    The architecture is selected by the model file's stem (filename
    without extension) from the bundled arch_config.yaml.

    Args:
        model_path (str | Path): path to the model weights file.

    Returns:
        the architecture sub-config for this model.

    Raises:
        ValueError: if the stem has no entry in arch_config.yaml.
    """
    from omegaconf import OmegaConf
    configs = OmegaConf.load(DEFAULT_CFG_PATH)
    stem = Path(model_path).stem
    if stem not in configs:
        raise ValueError(f"architecture {stem} is not in arch_config.yaml")
    return configs[stem]
\ No newline at end of file
......@@ -9,7 +9,7 @@ from magic_pdf.libs.config_reader import get_device
class RapidTableModel(object):
def __init__(self, ocr_engine, table_sub_model_name):
def __init__(self, ocr_engine, table_sub_model_name='slanet_plus'):
sub_model_list = [model.value for model in ModelType]
if table_sub_model_name is None:
input_args = RapidTableInput()
......@@ -23,25 +23,17 @@ class RapidTableModel(object):
self.table_model = RapidTable(input_args)
# if ocr_engine is None:
# self.ocr_model_name = "RapidOCR"
# if torch.cuda.is_available():
# from rapidocr_paddle import RapidOCR
# self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
# else:
# from rapidocr_onnxruntime import RapidOCR
# self.ocr_engine = RapidOCR()
# self.ocr_model_name = "RapidOCR"
# if torch.cuda.is_available():
# from rapidocr_paddle import RapidOCR
# self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
# else:
# self.ocr_model_name = "PaddleOCR"
# self.ocr_engine = ocr_engine
# from rapidocr_onnxruntime import RapidOCR
# self.ocr_engine = RapidOCR()
self.ocr_model_name = "PaddleOCR"
self.ocr_engine = ocr_engine
self.ocr_model_name = "RapidOCR"
if torch.cuda.is_available():
from rapidocr_paddle import RapidOCR
self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
else:
from rapidocr_onnxruntime import RapidOCR
self.ocr_engine = RapidOCR()
def predict(self, image):
......
import torch
from struct_eqtable import build_model
from magic_pdf.model.sub_modules.table.table_utils import minify_html
class StructTableModel:
    """Table-structure recognition backed by the StructEqTable model.

    Wraps ``struct_eqtable.build_model`` and post-processes HTML output
    with ``minify_html``. CUDA is required.
    """

    def __init__(self, model_path, max_new_tokens=1024, max_time=60):
        """
        Args:
            model_path: checkpoint path handed to build_model.
            max_new_tokens (int): generation length cap.
            max_time (int): per-inference time budget in seconds.
        """
        # StructEqTable only runs on GPU.
        assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
        self.default_format = "html"
        self.model = build_model(
            model_ckpt=model_path,
            max_new_tokens=max_new_tokens,
            max_time=max_time,
            lmdeploy=False,
            flash_attn=False,
            batch_size=1,
        ).cuda()

    def predict(self, images, output_format=None, **kwargs):
        """Run table recognition on *images*.

        Args:
            images: batch of table images accepted by the model.
            output_format (str | None): 'latex', 'markdown' or 'html';
                defaults to 'html' when None.

        Returns:
            list[str]: one result per image; HTML results are minified.

        Raises:
            ValueError: for an unsupported explicit output_format.
        """
        if output_format is None:
            output_format = self.default_format
        elif output_format not in ('latex', 'markdown', 'html'):
            raise ValueError(f"Output format {output_format} is not supported.")
        results = self.model(images, output_format=output_format)
        if output_format == "html":
            results = [minify_html(html) for html in results]
        return results
import os
import cv2
import numpy as np
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403
class TableMasterPaddleModel(object):
    """Convert a table image into an HTML string with a TableMaster model.

    Attributes:
        table_sys: a ppstructure TableSystem built from the parsed args.

    Methods:
        __init__(config): build the TableSystem from a config dict.
        img2html(image): run recognition on a PIL image or ndarray.
        parse_args(**kwargs): translate the config dict into argparse args.
    """

    def __init__(self, config):
        """
        Args:
            config (dict): must contain 'model_dir'; may contain 'device'
                and 'table_max_len'.
        """
        self.table_sys = TableSystem(self.parse_args(**config))

    def img2html(self, image):
        """Recognize the table in *image* and return its HTML markup.

        Args:
            image (PIL.Image.Image | np.ndarray): table crop; PIL input is
                converted from RGB to the BGR layout cv2/Paddle expect.

        Returns:
            str: HTML describing the table structure and cell contents.
        """
        if isinstance(image, Image.Image):
            image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
        pred_res, _ = self.table_sys(image)
        return pred_res['html']

    def parse_args(self, **kwargs):
        """Build the argparse namespace TableSystem expects.

        All model sub-paths are derived from kwargs['model_dir'];
        kwargs['device'] selects GPU when it starts with 'cuda'.
        """
        parser = init_args()
        model_dir = kwargs.get('model_dir')
        device = kwargs.get('device', 'cpu')
        defaults = {
            'use_gpu': device.startswith('cuda'),
            'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN),  # noqa: F405
            'table_algorithm': 'TableMaster',
            'table_model_dir': os.path.join(model_dir, TABLE_MASTER_DIR),  # noqa: F405
            'table_char_dict_path': os.path.join(model_dir, TABLE_MASTER_DICT),  # noqa: F405
            'det_model_dir': os.path.join(model_dir, DETECT_MODEL_DIR),  # noqa: F405
            'rec_model_dir': os.path.join(model_dir, REC_MODEL_DIR),  # noqa: F405
            'rec_char_dict_path': os.path.join(model_dir, REC_CHAR_DICT),  # noqa: F405
        }
        parser.set_defaults(**defaults)
        # Parse an empty argv so only the defaults above take effect.
        return parser.parse_args([])
......@@ -4,6 +4,7 @@ import os
import re
import statistics
import time
import warnings
from typing import List
import cv2
......@@ -11,6 +12,7 @@ import fitz
import torch
import numpy as np
from loguru import logger
from tqdm import tqdm
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
......@@ -21,20 +23,9 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
from concurrent.futures import ThreadPoolExecutor
try:
import torchtext
if torchtext.__version__ >= '0.18.0':
torchtext.disable_torchtext_deprecation_warning()
except ImportError:
pass
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.post_proc.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
......@@ -42,7 +33,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, check_chars_is_overlap_in_span
remove_overlaps_min_spans, remove_x_overlapping_chars
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
......@@ -64,14 +55,6 @@ def __replace_STX_ETX(text_str: str):
return text_str
def __replace_0xfffd(text_str: str):
    """Replace U+FFFD replacement characters with spaces.

    pymupdf emits \ufffd for glyphs it cannot decode, which would surface
    as garbled text downstream; falsy input (None or '') passes through
    unchanged.
    """
    if not text_str:
        return text_str
    return text_str.replace('\ufffd', ' ')
# 连写字符拆分
def __replace_ligatures(text: str):
ligatures = {
......@@ -84,16 +67,17 @@ def chars_to_content(span):
# 检查span中的char是否为空
if len(span['chars']) == 0:
pass
# span['content'] = ''
elif check_chars_is_overlap_in_span(span['chars']):
pass
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# 求char的平均宽度
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
char_avg_width = char_width_sum / len(span['chars'])
# Calculate the width of each character
char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
# Calculate the median width
median_width = statistics.median(char_widths)
# 通过x轴重叠比率移除一部分char
span = remove_x_overlapping_chars(span, median_width)
content = ''
for char in span['chars']:
......@@ -101,13 +85,12 @@ def chars_to_content(span):
# 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
char1 = char
char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
content += f"{char['c']} "
else:
content += char['c']
content = __replace_ligatures(content)
span['content'] = __replace_0xfffd(content)
span['content'] = __replace_ligatures(content)
del span['chars']
......@@ -122,10 +105,6 @@ def fill_char_in_spans(spans, all_chars):
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars:
# 跳过非法bbox的char
# x1, y1, x2, y2 = char['bbox']
# if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
# continue
for span in spans:
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
......@@ -215,7 +194,7 @@ def calculate_contrast(img, img_mode) -> float:
std_dev = np.std(gray_img)
# 对比度定义为标准差除以平均值(加上小常数避免除零错误)
contrast = std_dev / (mean_value + 1e-6)
# logger.info(f"contrast: {contrast}")
# logger.debug(f"contrast: {contrast}")
return round(contrast, 2)
# @measure_time
......@@ -308,41 +287,53 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if len(need_ocr_spans) > 0:
# 初始化ocr模型
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
)
# atom_model_manager = AtomModelSingleton()
# ocr_model = atom_model_manager.get_atom_model(
# atom_model_name='ocr',
# ocr_show_log=False,
# det_db_box_thresh=0.3,
# lang=lang
# )
for span in need_ocr_spans:
# 对span的bbox截图再ocr
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
# 计算span的对比度,低于0.20的span不进行ocr
if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
spans.remove(span)
continue
# pass
span['content'] = ''
span['score'] = 1
span['np_img'] = span_img
ocr_res = ocr_model.ocr(span_img, det=False)
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text
span['score'] = ocr_score
else:
spans.remove(span)
# ocr_res = ocr_model.ocr(span_img, det=False)
# if ocr_res and len(ocr_res) > 0:
# if len(ocr_res[0]) > 0:
# ocr_text, ocr_score = ocr_res[0][0]
# # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
# if ocr_score > 0.5 and len(ocr_text) > 0:
# span['content'] = ocr_text
# span['score'] = float(round(ocr_score, 2))
# else:
# spans.remove(span)
return spans
def model_init(model_name: str):
from transformers import LayoutLMv3ForTokenClassification
device = torch.device(get_device())
device_name = get_device()
bf_16_support = False
if device_name.startswith("cuda"):
bf_16_support = torch.cuda.is_bf16_supported()
elif device_name.startswith("mps"):
bf_16_support = True
device = torch.device(device_name)
if model_name == 'layoutreader':
# 检测modelscope的缓存目录是否存在
layoutreader_model_dir = get_local_layoutreader_model_dir()
......@@ -357,7 +348,10 @@ def model_init(model_name: str):
model = LayoutLMv3ForTokenClassification.from_pretrained(
'hantian/layoutreader'
)
model.to(device).eval()
if bf_16_support:
model.to(device).eval().bfloat16()
else:
model.to(device).eval()
else:
logger.error('model name not allow')
exit(1)
......@@ -383,9 +377,12 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
boxes2inputs, parse_logits, prepare_inputs)
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
return parse_logits(logits, len(boxes))
......@@ -463,20 +460,20 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
if (
block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
): # 可能是双列结构,可以切细点
lines = int(block_height / line_height) + 1
lines = int(block_height / line_height)
else:
# 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
if block_weight > page_w * 0.4:
lines = 3
line_height = (y1 - y0) / lines
elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
lines = int(block_height / line_height) + 1
lines = int(block_height / line_height)
else: # 判断长宽比
if block_height / block_weight > 1.2: # 细长的不分
return [[x0, y0, x1, y1]]
else: # 不细长的还是分成两行
lines = 2
line_height = (y1 - y0) / lines
line_height = (y1 - y0) / lines
# 确定从哪个y位置开始绘制线条
current_y = y0
......@@ -492,7 +489,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
else:
return [[x0, y0, x1, y1]]
# @measure_time
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
......@@ -936,17 +933,18 @@ def pdf_parse_union(
logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(dataset) - 1
"""初始化启动时间"""
start_time = time.time()
# """初始化启动时间"""
# start_time = time.time()
for page_id, page in enumerate(dataset):
"""debug时输出每页解析的耗时."""
if debug_mode:
time_now = time.time()
logger.info(
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
)
start_time = time_now
# for page_id, page in enumerate(dataset):
for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
# """debug时输出每页解析的耗时."""
# if debug_mode:
# time_now = time.time()
# logger.info(
# f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
# )
# start_time = time_now
"""解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id:
......@@ -962,7 +960,47 @@ def pdf_parse_union(
)
pdf_info_dict[f'page_{page_id}'] = page_info
# PerformanceStats.print_stats()
need_ocr_list = []
img_crop_list = []
text_block_list = []
for pange_id, page_info in pdf_info_dict.items():
for block in page_info['preproc_blocks']:
if block['type'] in ['table', 'image']:
for sub_block in block['blocks']:
if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
text_block_list.append(sub_block)
elif block['type'] in ['text', 'title']:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(span['np_img'])
span.pop('np_img')
if len(img_crop_list) > 0:
# Get OCR results for this language's images
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
)
# rec_start = time.time()
ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
# Verify we have matching counts
assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
# Process OCR results for this language
for index, span in enumerate(need_ocr_list):
ocr_text, ocr_score = ocr_res_list[index]
span['content'] = ocr_text
span['score'] = float(round(ocr_score, 2))
# rec_time = time.time() - rec_start
# logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
"""分段"""
para_split(pdf_info_dict)
......
......@@ -62,7 +62,15 @@ def merge_spans_to_line(spans, threshold=0.6):
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
return block_type in [
BlockType.Text,
BlockType.Title,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableCaption,
BlockType.TableFootnote,
BlockType.Discarded
]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
......
......@@ -41,6 +41,57 @@ def check_chars_is_overlap_in_span(chars):
return False
def remove_x_overlapping_chars(span, median_width):
    """Drop duplicate characters that overlap heavily on the x-axis.

    Walks adjacent character pairs (assumed sorted left-to-right) and,
    when two chars overlap by more than 30% of the median char width AND
    are the same character (or one is a space), removes the narrower of
    the two. The index is not advanced after a removal so the freshly
    adjacent pair is re-examined.

    Args:
        span (dict): span holding a 'chars' list; each char has a 'bbox'
            [x0, y0, x1, y1] and a single-character string 'c'.
        median_width (float): median character width used to scale the
            overlap threshold.

    Returns:
        dict: the same span, with overlapping duplicates removed in place.
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    chars = span['chars']
    overlap_threshold = median_width * 0.3
    i = 0
    while i + 1 < len(chars):
        left, right = chars[i], chars[i + 1]
        # Horizontal intersection of the two boxes (negative => disjoint).
        overlap = (min(left['bbox'][2], right['bbox'][2])
                   - max(left['bbox'][0], right['bbox'][0]))
        is_duplicate = (left['c'] == right['c']
                        or left['c'] == ' '
                        or right['c'] == ' ')
        if overlap > 0 and overlap > overlap_threshold and is_duplicate:
            left_width = left['bbox'][2] - left['bbox'][0]
            right_width = right['bbox'][2] - right['bbox'][0]
            # Keep the wider char; on ties the right one is dropped.
            chars.pop(i if left_width < right_width else i + 1)
            # Do not advance: re-check the new pair at position i.
        else:
            i += 1
    return span
def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
......
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
\ No newline at end of file
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
......@@ -2,7 +2,7 @@ weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_small_2501
unimernet_small: MFR/unimernet_hf_small_2503
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
rapid_table: TabRec/RapidTable
\ No newline at end of file
import os
import shutil
import tempfile
from pathlib import Path
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
......@@ -87,37 +90,38 @@ without method specified, auto will be used by default.""",
default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
raise Exception(f'Unknown file suffix: {path.suffix}')
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: Path):
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
if dataset is None:
pdf_data_or_dataset = read_fn(doc_path)
else:
pdf_data_or_dataset = dataset
do_parse(
output_dir,
file_name,
pdf_data,
pdf_data_or_dataset,
[],
method,
debug_able,
......@@ -130,9 +134,23 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
doc_paths = []
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
if doc_path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(doc_path), temp_dir)
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
elif doc_path.suffix in image_suffixes:
with open(str(doc_path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
doc_path = Path(fn)
doc_paths.append(doc_path)
datasets = batch_build_dataset(doc_paths, 4, lang)
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
else:
parse_doc(Path(path))
......
......@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
doc_analyze)
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
......@@ -67,13 +67,13 @@ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_i
return output_bytes
def do_parse(
def _do_parse(
output_dir,
pdf_file_name,
pdf_bytes,
pdf_bytes_or_dataset,
model_list,
parse_method,
debug_able,
debug_able=False,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
......@@ -92,16 +92,21 @@ def do_parse(
formula_enable=None,
table_enable=None,
):
from magic_pdf.operators.models import InferenceResult
if debug_able:
logger.warning('debug mode is on')
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
# f_draw_char_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id
)
if isinstance(pdf_bytes_or_dataset, bytes):
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes_or_dataset, start_page_id, end_page_id
)
ds = PymuDocDataset(pdf_bytes, lang=lang)
else:
ds = pdf_bytes_or_dataset
pdf_bytes = ds._raw_data
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
......@@ -109,8 +114,6 @@ def do_parse(
)
image_dir = str(os.path.basename(local_image_dir))
ds = PymuDocDataset(pdf_bytes, lang=lang)
if len(model_list) == 0:
if model_config.__use_inside_model__:
if parse_method == 'auto':
......@@ -241,5 +244,80 @@ def do_parse(
logger.info(f'local output dir is {local_md_dir}')
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one document (raw PDF bytes or a pre-built dataset) and dump outputs.

    Entry point kept backward-compatible with the pre-1.3.0 signature.  When the
    environment variable ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a value
    greater than 1, the document is routed through the batch pipeline
    (``batch_do_parse``); otherwise the serial pipeline (``_do_parse``) is used.

    Args:
        output_dir: Directory where all result files are written.
        pdf_file_name: Base name used for the per-document output folder/files.
        pdf_bytes_or_dataset: Raw PDF bytes, or an already constructed Dataset.
        model_list: Pre-computed model inference results; empty list means
            inference is run internally (serial path only — the batch path
            always re-runs inference and ignores this argument).
        parse_method: One of 'auto', 'ocr', 'txt'.
        debug_able: Enables extra debug drawing when True.
        f_*: Feature flags controlling which artifacts are drawn/dumped.
        start_page_id / end_page_id: Page range, applied only when raw bytes
            are given (a Dataset is assumed to be pre-trimmed).
        lang: OCR language hint.
        layout_model / formula_enable / table_enable: Model configuration
            overrides forwarded to the analysis step.
    """
    # Serial by default; opt into batch mode via the environment variable.
    parallel_count = 1
    env_count = os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT')
    if env_count:
        parallel_count = int(env_count)

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            # Apply the page range up front: batch_do_parse has no
            # start/end page parameters of its own.
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        # Fix: layout_model / formula_enable / table_enable were previously
        # dropped on this path, so parallel mode silently ignored them.
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
        )
def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference on a batch of documents, then dump results one by one.

    Each entry of ``pdf_bytes_or_datasets`` may be raw PDF bytes (wrapped into
    a ``PymuDocDataset``) or an already constructed ``Dataset``.  Inference is
    performed for the whole batch via ``batch_doc_analyze``; the per-document
    output stage is delegated to ``_do_parse`` with the pre-computed model
    results, so no inference is repeated there.
    """
    # Coerce every batch entry into a Dataset; bytes are wrapped lazily here.
    datasets = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]
    infer_results = batch_doc_analyze(
        datasets,
        parse_method,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
    # Dump each document's artifacts using its matching inference result.
    for file_name, dataset, infer_result in zip(pdf_file_names, datasets, infer_results):
        _do_parse(
            output_dir,
            file_name,
            dataset,
            infer_result.get_infer_res(),
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
......@@ -28,12 +28,12 @@ NVIDIA drivers are already installed, and you can skip Step 2.
.. note::
``CUDA Version`` should be >= 12.1, If the displayed version number is less than 12.1, please upgrade the driver.
``CUDA Version`` should be >= 12.4. If the displayed version number is less than 12.4, please upgrade the driver.
.. code:: text
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34 Driver Version: 537.34 CUDA Version: 12.2 |
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
......@@ -52,7 +52,7 @@ If no driver is installed, use the following command:
.. code:: sh
sudo apt-get update
sudo apt-get install nvidia-driver-545
sudo apt-get install nvidia-driver-570-server
Install the proprietary driver and restart your computer after
installation.
......@@ -80,15 +80,15 @@ Specify Python version 3.10.
.. code:: sh
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
5. Install Applications
~~~~~~~~~~~~~~~~~~~~~~~
.. code:: sh
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install -U magic-pdf[full]
.. admonition:: Important
:class: tip
......@@ -99,7 +99,7 @@ Specify Python version 3.10.
magic-pdf --version
If the version number is less than 0.7.0, please report the issue.
If the version number is less than 1.3.0, please report the issue.
6. Download Models
~~~~~~~~~~~~~~~~~~
......@@ -126,7 +126,7 @@ Download a sample file from the repository and test it.
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
9. Test CUDA Acceleration
......@@ -150,23 +150,6 @@ to test CUDA acceleration:
magic-pdf -p small_ocr.pdf -o ./output
10. Enable CUDA Acceleration for OCR
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Download ``paddlepaddle-gpu``. Installation will automatically enable
OCR acceleration.
.. code:: sh
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
2. Test OCR acceleration with the following command:
.. code:: sh
magic-pdf -p small_ocr.pdf -o ./output
.. _windows_10_or_11_section:
......@@ -176,11 +159,12 @@ Windows 10/11
1. Install CUDA and cuDNN
~~~~~~~~~~~~~~~~~~~~~~~~~
Required versions: CUDA 11.8 + cuDNN 8.7.0
You need to install a CUDA version that is compatible with torch's requirements. Currently, torch supports CUDA 11.8/12.4/12.6.
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x:
https://developer.nvidia.com/rdp/cudnn-archive
2. Install Anaconda
~~~~~~~~~~~~~~~~~~~
......@@ -192,19 +176,17 @@ Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86
3. Create an Environment Using Conda
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Python version must be 3.10.
::
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
4. Install Applications
~~~~~~~~~~~~~~~~~~~~~~~
::
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install -U magic-pdf[full]
.. admonition:: Important
:class: tip
......@@ -215,7 +197,7 @@ Python version must be 3.10.
magic-pdf --version
If the version number is less than 0.7.0, please report it in the issues section.
If the version number is less than 1.3.0, please report it in the issues section.
5. Download Models
~~~~~~~~~~~~~~~~~~
......@@ -242,7 +224,7 @@ Download a sample file from the repository and test it.
.. code:: powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
8. Test CUDA Acceleration
......@@ -251,23 +233,12 @@ Download a sample file from the repository and test it.
If your graphics card has at least 8GB of VRAM, follow these steps to
test CUDA-accelerated parsing performance.
1. **Overwrite the installation of torch and torchvision** supporting CUDA.
1. **Overwrite the installation of torch and torchvision** supporting CUDA. (Please select the appropriate index-url based on your CUDA version. For more details, refer to the `PyTorch official website <https://pytorch.org/get-started/locally/>`_.)
.. code:: sh
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
.. admonition:: Important
:class: tip
❗️Ensure the following versions are specified in the command:
.. code:: sh
pip install --force-reinstall torch==2.6.0 torchvision==0.21.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu124
torch==2.3.1 torchvision==0.18.1
These are the highest versions we support. Installing higher versions without specifying them will cause the program to fail.
2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json``
configuration file located in your user directory.
......@@ -283,19 +254,3 @@ test CUDA-accelerated parsing performance.
::
magic-pdf -p small_ocr.pdf -o ./output
9. Enable CUDA Acceleration for OCR
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. **Download paddlepaddle-gpu**, which will automatically enable OCR
acceleration upon installation.
::
pip install paddlepaddle-gpu==2.6.1
2. **Run the following command to test OCR acceleration**:
::
magic-pdf -p small_ocr.pdf -o ./output
......@@ -37,53 +37,57 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
}
</style>
<table>
<tr>
<td colspan="3" rowspan="2">Operating System</td>
</tr>
<tr>
<td>Ubuntu 22.04 LTS</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64(unsupported ARM Linux)</td>
<td>x86_64(unsupported ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">Memory</td>
<td colspan="3">16GB or more, recommended 32GB+</td>
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
<td>latest (Proprietary Driver)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
<td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU Hardware Support List</td>
<td colspan="2">Minimum Requirement 8G+ VRAM</td>
<td colspan="2">3060ti/3070/4060<br>
8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
<td rowspan="2">None</td>
</tr>
<tr>
<td colspan="2">Recommended Configuration 10G+ VRAM</td>
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
</td>
</tr>
<tr>
<td colspan="3" rowspan="2">Operating System</td>
</tr>
<tr>
<td>Linux after 2019</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64 / arm64</td>
<td>x86_64(unsupported ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">Memory Requirements</td>
<td colspan="3">16GB or more, recommended 32GB+</td>
</tr>
<tr>
<td colspan="3">Storage Requirements</td>
<td colspan="3">20GB or more, with a preference for SSD</td>
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">3.10~3.12</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
<td>latest (Proprietary Driver)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>11.8/12.4/12.6</td>
<td>11.8/12.4/12.6</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CANN Environment(NPU support)</td>
<td>8.0+(Ascend 910b)</td>
<td>None</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU/MPS Hardware Support List</td>
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
    <td rowspan="2">Apple silicon</td>
</tr>
</table>
......@@ -93,9 +97,9 @@ Create an environment
.. code-block:: shell
conda create -n MinerU python=3.10
conda activate MinerU
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
conda create -n mineru 'python<3.13' -y
conda activate mineru
pip install -U "magic-pdf[full]"
Download model weight files
......
......@@ -10,7 +10,7 @@
.. admonition:: Important
:class: tip
Docker 需要至少 16GB 显存的 GPU,并且所有加速功能默认启用。
Docker 需要至少 6GB 显存的 GPU,并且所有加速功能默认启用。
在运行此 Docker 容器之前,您可以使用以下命令检查您的设备是否支持 Docker 上的 CUDA 加速。
......@@ -20,10 +20,10 @@
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/Dockerfile -O Dockerfile
docker build -t mineru:latest .
docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
magic-pdf --help
.. _ubuntu_22_04_lts_section:
......@@ -42,12 +42,12 @@ Ubuntu 22.04 LTS
.. admonition:: Important
:class: tip
``CUDA Version`` 显示的版本号应 >=12.1,如显示的版本号小于12.1,请升级驱动
``CUDA Version`` 显示的版本号应 >= 12.4,如显示的版本号小于12.4,请升级驱动
.. code:: text
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34 Driver Version: 537.34 CUDA Version: 12.2 |
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
......@@ -66,7 +66,7 @@ Ubuntu 22.04 LTS
.. code:: bash
sudo apt-get update
sudo apt-get install nvidia-driver-545
sudo apt-get install nvidia-driver-570-server
安装专有驱动,安装完成后,重启电脑
......@@ -89,19 +89,17 @@ Ubuntu 22.04 LTS
4. 使用 conda 创建环境
---------------------
需指定 python 版本为3.10
.. code:: bash
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
5. 安装应用
-----------
.. code:: bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
.. admonition:: Important
:class: tip
......@@ -112,7 +110,7 @@ Ubuntu 22.04 LTS
magic-pdf --version
如果版本号小于0.7.0,请到issue中向我们反馈
如果版本号小于1.3.0,请到issue中向我们反馈
6. 下载模型
-----------
......@@ -136,7 +134,7 @@ Ubuntu 22.04 LTS
.. code:: bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/small_ocr.pdf
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
9. 测试CUDA加速
......@@ -163,27 +161,8 @@ Ubuntu 22.04 LTS
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``layout detection cost`` 和 ``mfr time`` 应提速10倍以上。
10. 为 ocr 开启 cuda 加速
---------------------
**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
.. code:: bash
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下,cuda应比cpu更快。
**2.运行以下命令测试ocr加速效果**
.. code:: bash
magic-pdf -p small_ocr.pdf -o ./output
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``ocr cost`` 应提速10倍以上。
.. _windows_10_or_11_section:
......@@ -194,10 +173,12 @@ Windows 10/11
1. 安装 cuda 和 cuDNN
------------------
需要安装的版本 CUDA 11.8 + cuDNN 8.7.0
需要安装符合torch要求的cuda版本,torch目前支持11.8/12.4/12.6
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x https://developer.nvidia.com/rdp/cudnn-archive
2. 安装 anaconda
---------------
......@@ -209,19 +190,17 @@ Windows 10/11
3. 使用 conda 创建环境
---------------------
需指定python版本为3.10
.. code:: bash
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
4. 安装应用
-----------
.. code:: bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
.. admonition:: Important
:class: tip
......@@ -232,7 +211,7 @@ Windows 10/11
magic-pdf --version
如果版本号小于0.7.0,请到issue中向我们反馈
如果版本号小于1.3.0,请到issue中向我们反馈
5. 下载模型
-----------
......@@ -256,7 +235,7 @@ Windows 10/11
.. code:: powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
8. 测试 CUDA 加速
......@@ -264,22 +243,13 @@ Windows 10/11
如果您的显卡显存大于等于 **8GB**,可以进行以下流程,测试 CUDA 解析加速效果
**1.覆盖安装支持cuda的torch和torchvision**
.. code:: bash
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
**1.覆盖安装支持cuda的torch和torchvision**(请根据cuda版本选择合适的index-url,具体可参考torch官网:https://pytorch.org/get-started/locally/)
.. admonition:: Important
:class: tip
务必在命令中指定以下版本
.. code:: bash
.. code:: bash
torch==2.3.1 torchvision==0.18.1
pip install --force-reinstall torch==2.6.0 torchvision==0.21.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu124
这是我们支持的最高版本,如果不指定版本会自动安装更高版本导致程序无法运行
**2.修改【用户目录】中配置文件magic-pdf.json中”device-mode”的值**
......@@ -298,24 +268,5 @@ Windows 10/11
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下, ``layout detection time`` 和 ``mfr time`` 应提速10倍以上。
9. 为 ocr 开启 cuda 加速
--------------------
**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
.. code:: bash
pip install paddlepaddle-gpu==2.6.1
**2.运行以下命令测试ocr加速效果**
.. code:: bash
magic-pdf -p small_ocr.pdf -o ./output
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下, cuda会比cpu更快。
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``ocr time`` 应提速10倍以上。
......@@ -24,53 +24,58 @@
}
</style>
<table>
<tr>
<td colspan="3" rowspan="2">操作系统</td>
</tr>
<tr>
<td>Ubuntu 22.04 LTS</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64(暂不支持ARM Linux)</td>
<td>x86_64(暂不支持ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">内存</td>
<td colspan="3">大于等于16GB,推荐32G以上</td>
</tr>
<tr>
<td colspan="3">python版本</td>
<td colspan="3">3.10 (请务必通过conda创建3.10虚拟环境)</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver 版本</td>
<td>latest(专有驱动)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA环境</td>
<td>自动安装[12.1(pytorch)+11.8(paddle)]</td>
<td>11.8(手动安装)+cuDNN v8.7.0(手动安装)</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU硬件支持列表</td>
<td colspan="2">最低要求 8G+显存</td>
<td colspan="2">3060ti/3070/4060<br>
8G显存可开启layout、公式识别和ocr加速</td>
<td rowspan="2">None</td>
</tr>
<tr>
<td colspan="2">推荐配置 10G+显存</td>
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
10G显存及以上可以同时开启layout、公式识别和ocr加速和表格识别加速<br>
</td>
</tr>
<tr>
<td colspan="3" rowspan="2">操作系统</td>
</tr>
<tr>
<td>Linux after 2019</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64 / arm64</td>
<td>x86_64(暂不支持ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">内存</td>
<td colspan="3">大于等于16GB,推荐32G以上</td>
</tr>
<tr>
<td colspan="3">存储空间</td>
<td colspan="3">大于等于20GB,推荐使用SSD以获得最佳性能</td>
</tr>
<tr>
<td colspan="3">python版本</td>
<td colspan="3">>=3.9,<=3.12</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver 版本</td>
<td>latest(专有驱动)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA环境</td>
<td>11.8/12.4/12.6</td>
<td>11.8/12.4/12.6</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CANN环境(NPU支持)</td>
<td>8.0+(Ascend 910b)</td>
<td>None</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU/MPS 硬件支持列表</td>
<td colspan="2">显存6G以上</td>
<td colspan="2">
Volta(2017)及之后生产的全部带Tensor Core的GPU <br>
6G显存及以上</td>
    <td rowspan="2">Apple silicon</td>
</tr>
</table>
......@@ -79,9 +84,9 @@
.. code-block:: shell
conda create -n MinerU python=3.10
conda activate MinerU
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
conda create -n mineru 'python<3.13' -y
conda activate mineru
pip install -U "magic-pdf[full]" -i https://mirrors.aliyun.com/pypi/simple
下载模型权重文件
......
......@@ -23,10 +23,10 @@ RUN apt-get update && \
COPY requirements.txt .
RUN python -m venv /app/venv && \
. /app/venv/bin/activate && \
pip install -r requirements.txt && \
pip uninstall -y paddlepaddle && \
pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
paddlepaddle-gpu==3.0.0rc1
pip install -r requirements.txt
# pip uninstall -y paddlepaddle && \
# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
# paddlepaddle-gpu==3.0.0rc1
# Download models
COPY download_models.py .
......@@ -51,8 +51,8 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*
# Create volume for paddleocr models
RUN mkdir -p /root/.paddleocr
VOLUME [ "/root/.paddleocr" ]
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]
# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
......
......@@ -18,11 +18,9 @@ docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_prox
## 启动命令
```
docker run --rm -it --gpus=all -v ./paddleocr:/root/.paddleocr -p 8000:8000 mineru-api
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```
初次调用 API 时会自动下载 paddleocr 的模型(约数十 MB),其余模型已包含在镜像中。
## 测试参数
访问地址:
......@@ -30,31 +28,4 @@ docker run --rm -it --gpus=all -v ./paddleocr:/root/.paddleocr -p 8000:8000 mine
```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
## 旧版镜像地址
> 阿里云地址:docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
>
> dockerhub地址:docker pull quincyqiang/mineru:0.1-models
## 旧版截图
### 启动命令
![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
具体截图请见博客:https://blog.csdn.net/yanqianglifei/article/details/141979684
### 启动日志
![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
### 测试参数
![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
### 解析效果
![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)
```
\ No newline at end of file
......@@ -3,6 +3,7 @@ import os
from base64 import b64encode
from glob import glob
from io import StringIO
import tempfile
from typing import Tuple, Union
import uvicorn
......@@ -10,11 +11,12 @@ from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from magic_pdf.data.read_api import read_local_images, read_local_office
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
......@@ -24,6 +26,9 @@ model_config.__use_inside_model__ = True
app = FastAPI()
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg"]
class MemoryDataWriter(DataWriter):
def __init__(self):
......@@ -46,8 +51,8 @@ class MemoryDataWriter(DataWriter):
def init_writers(
pdf_path: str = None,
pdf_file: UploadFile = None,
file_path: str = None,
file: UploadFile = None,
output_path: str = None,
output_image_path: str = None,
) -> Tuple[
......@@ -59,19 +64,19 @@ def init_writers(
Initialize writers based on path type
Args:
pdf_path: PDF file path (local path or S3 path)
pdf_file: Uploaded PDF file object
file_path: file path (local path or S3 path)
file: Uploaded file object
output_path: Output directory path
output_image_path: Image output directory path
Returns:
Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF
file content
Tuple[writer, image_writer, file_bytes]: Returns initialized writer tuple and file content
"""
if pdf_path:
is_s3_path = pdf_path.startswith("s3://")
file_extension:str = None
if file_path:
is_s3_path = file_path.startswith("s3://")
if is_s3_path:
bucket = get_bucket_name(pdf_path)
bucket = get_bucket_name(file_path)
ak, sk, endpoint = get_s3_config(bucket)
writer = S3DataWriter(
......@@ -84,25 +89,29 @@ def init_writers(
temp_reader = S3DataReader(
"", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
)
pdf_bytes = temp_reader.read(pdf_path)
file_bytes = temp_reader.read(file_path)
file_extension = os.path.splitext(file_path)[1]
else:
writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
os.makedirs(output_image_path, exist_ok=True)
with open(pdf_path, "rb") as f:
pdf_bytes = f.read()
with open(file_path, "rb") as f:
file_bytes = f.read()
file_extension = os.path.splitext(file_path)[1]
else:
# 处理上传的文件
pdf_bytes = pdf_file.file.read()
file_bytes = file.file.read()
file_extension = os.path.splitext(file.filename)[1]
writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
os.makedirs(output_image_path, exist_ok=True)
return writer, image_writer, pdf_bytes
return writer, image_writer, file_bytes, file_extension
def process_pdf(
pdf_bytes: bytes,
def process_file(
file_bytes: bytes,
file_extension: str,
parse_method: str,
image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
......@@ -110,14 +119,30 @@ def process_pdf(
Process PDF file content
Args:
pdf_bytes: Binary content of PDF file
file_bytes: Binary content of file
file_extension: file extension
parse_method: Parse method ('ocr', 'txt', 'auto')
image_writer: Image writer
Returns:
Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
"""
ds = PymuDocDataset(pdf_bytes)
ds = Union[PymuDocDataset, ImageDataset]
if file_extension in pdf_extensions:
ds = PymuDocDataset(file_bytes)
elif file_extension in office_extensions:
# 需要使用office解析
temp_dir = tempfile.mkdtemp()
with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
f.write(file_bytes)
ds = read_local_office(temp_dir)[0]
elif file_extension in image_extensions:
# 需要使用ocr解析
temp_dir = tempfile.mkdtemp()
with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
f.write(file_bytes)
ds = read_local_images(temp_dir)[0]
infer_result: InferenceResult = None
pipe_result: PipeResult = None
......@@ -145,13 +170,13 @@ def encode_image(image_path: str) -> str:
@app.post(
"/pdf_parse",
"/file_parse",
tags=["projects"],
summary="Parse PDF files (supports local files and S3)",
summary="Parse files (supports local files and S3)",
)
async def pdf_parse(
pdf_file: UploadFile = None,
pdf_path: str = None,
async def file_parse(
file: UploadFile = None,
file_path: str = None,
parse_method: str = "auto",
is_json_md_dump: bool = False,
output_dir: str = "output",
......@@ -165,10 +190,10 @@ async def pdf_parse(
to the specified directory.
Args:
pdf_file: The PDF file to be parsed. Must not be specified together with
`pdf_path`
pdf_path: The path to the PDF file to be parsed. Must not be specified together
with `pdf_file`
file: The PDF file to be parsed. Must not be specified together with
`file_path`
file_path: The path to the PDF file to be parsed. Must not be specified together
with `file`
parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
results are not satisfactory, try ocr
is_json_md_dump: Whether to write parsed data to .json and .md files. Default
......@@ -181,31 +206,31 @@ async def pdf_parse(
return_content_list: Whether to return parsed PDF content list. Default to False
"""
try:
if (pdf_file is None and pdf_path is None) or (
pdf_file is not None and pdf_path is not None
if (file is None and file_path is None) or (
file is not None and file_path is not None
):
return JSONResponse(
content={"error": "Must provide either pdf_file or pdf_path"},
content={"error": "Must provide either file or file_path"},
status_code=400,
)
# Get PDF filename
pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split(
file_name = os.path.basename(file_path if file_path else file.filename).split(
"."
)[0]
output_path = f"{output_dir}/{pdf_name}"
output_path = f"{output_dir}/{file_name}"
output_image_path = f"{output_path}/images"
# Initialize readers/writers and get PDF content
writer, image_writer, pdf_bytes = init_writers(
pdf_path=pdf_path,
pdf_file=pdf_file,
writer, image_writer, file_bytes, file_extension = init_writers(
file_path=file_path,
file=file,
output_path=output_path,
output_image_path=output_image_path,
)
# Process PDF
infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer)
# Use MemoryDataWriter to get results
content_list_writer = MemoryDataWriter()
......@@ -226,23 +251,23 @@ async def pdf_parse(
# If results need to be saved
if is_json_md_dump:
writer.write_string(
f"{pdf_name}_content_list.json", content_list_writer.get_value()
f"{file_name}_content_list.json", content_list_writer.get_value()
)
writer.write_string(f"{pdf_name}.md", md_content)
writer.write_string(f"{file_name}.md", md_content)
writer.write_string(
f"{pdf_name}_middle.json", middle_json_writer.get_value()
f"{file_name}_middle.json", middle_json_writer.get_value()
)
writer.write_string(
f"{pdf_name}_model.json",
f"{file_name}_model.json",
json.dumps(model_json, indent=4, ensure_ascii=False),
)
# Save visualization results
pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
pipe_result.draw_line_sort(
os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
os.path.join(output_path, f"{file_name}_line_sort.pdf")
)
infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))
infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))
# Build return data
data = {}
......
......@@ -4,12 +4,13 @@ from huggingface_hub import snapshot_download
if __name__ == "__main__":
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
# "models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small_2501/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
"models/MFR/unimernet_hf_small_2503/*",
"models/OCR/paddleocr_torch/*",
# "models/TabRec/TableMaster/*",
# "models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download(
"opendatalab/PDF-Extract-Kit-1.0",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment