Unverified Commit 0c7a0882 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2611 from myhloli/dev

Dev
parents 3bd0ecf1 a392f445
import json
from io import BytesIO
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from .enum_class import BlockType, ContentType
def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
    """Draw plain (un-numbered) rectangles for page ``i``'s bboxes on a canvas.

    Args:
        i: Page index into ``bbox_list``.
        bbox_list: Per-page lists of [x0, y0, x1, y1] boxes in top-left-origin coords.
        page: pypdf page object; only ``cropbox`` is read for the page height.
        c: reportlab canvas to draw on.
        rgb_config: Color as 0-255 ``[r, g, b]``.
        fill_config: Truthy → translucent filled rectangles; falsy → outlines.

    Returns:
        The same canvas ``c`` (drawn on in place).
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]
    # Force float: pypdf cropbox entries can be Decimal-like NumberObjects,
    # which reportlab arithmetic does not like (mirrors draw_bbox_with_number).
    page_height = float(page.cropbox[3])
    for bbox in page_data:
        x0, y0, x1, y1 = map(float, bbox)
        width = x1 - x0
        height = y1 - y0
        # Flip y: the PDF canvas origin is bottom-left, bboxes are top-left based.
        rect = [x0, page_height - y1, width, height]
        if fill_config:  # translucent filled rectangle
            c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3)
            c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
        else:  # outline only
            c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2])
            c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
    return c
def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True):
    """Draw rectangles for page ``i``'s bboxes and number them 1..N.

    The 1-based index of each bbox is drawn just right of its top-right
    corner; with ``draw_bbox=False`` only the numbers are drawn.

    Args:
        i: Page index into ``bbox_list``.
        bbox_list: Per-page lists of [x0, y0, x1, y1] boxes in top-left-origin coords.
        page: pypdf page object; only ``cropbox`` is read for the page height.
        c: reportlab canvas to draw on.
        rgb_config: Color as 0-255 ``[r, g, b]``.
        fill_config: Truthy → translucent filled rectangles; falsy → outlines.
        draw_bbox: Whether to draw the rectangle itself in addition to the number.

    Returns:
        The same canvas ``c`` (drawn on in place).
    """
    new_rgb = [float(color) / 255 for color in rgb_config]
    page_data = bbox_list[i]
    # Force float: pypdf cropbox entries can be Decimal-like NumberObjects.
    page_height = float(page.cropbox[3])
    for j, bbox in enumerate(page_data):
        # Make sure every bbox coordinate is a float as well.
        x0, y0, x1, y1 = map(float, bbox)
        width = x1 - x0
        height = y1 - y0
        # Flip y: the PDF canvas origin is bottom-left, bboxes are top-left based.
        rect = [x0, page_height - y1, width, height]
        if draw_bbox:
            if fill_config:
                c.setFillColorRGB(*new_rgb, 0.3)
                c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1)
            else:
                c.setStrokeColorRGB(*new_rgb)
                c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0)
        c.setFillColorRGB(*new_rgb, 1.0)
        c.setFontSize(size=10)
        # Label position also uses the flipped-y float coordinates.
        c.drawString(x1 + 2, page_height - y0 - 10, str(j + 1))
    return c
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug copy of the PDF with every layout block drawn as a
    colored translucent rectangle and the reading order numbered per page.

    Args:
        pdf_info: per-page layout dicts (middle-JSON "pdf_info").
        pdf_bytes: raw bytes of the source PDF.
        out_path: directory the annotated PDF is written into.
        filename: name of the output file inside ``out_path``.
    """
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    imgs_footnote_list = []
    titles_list = []
    texts_list = []
    interequations_list = []
    lists_list = []
    indexs_list = []
    # Pass 1: collect, per page, one bbox list per block category.
    for page in pdf_info:
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles = []
        texts = []
        interequations = []
        lists = []
        indices = []
        for dropped_bbox in page['discarded_blocks']:
            page_dropped_list.append(dropped_bbox['bbox'])
        dropped_bbox_list.append(page_dropped_list)
        for block in page["para_blocks"]:
            bbox = block["bbox"]
            if block["type"] == BlockType.TABLE:
                tables.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.TABLE_BODY:
                        tables_body.append(bbox)
                    elif nested_block["type"] == BlockType.TABLE_CAPTION:
                        tables_caption.append(bbox)
                    elif nested_block["type"] == BlockType.TABLE_FOOTNOTE:
                        tables_footnote.append(bbox)
            elif block["type"] == BlockType.IMAGE:
                imgs.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.IMAGE_BODY:
                        imgs_body.append(bbox)
                    elif nested_block["type"] == BlockType.IMAGE_CAPTION:
                        imgs_caption.append(bbox)
                    elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE:
                        imgs_footnote.append(bbox)
            elif block["type"] == BlockType.TITLE:
                titles.append(bbox)
            elif block["type"] == BlockType.TEXT:
                texts.append(bbox)
            elif block["type"] == BlockType.INTERLINE_EQUATION:
                interequations.append(bbox)
            elif block["type"] == BlockType.LIST:
                lists.append(bbox)
            elif block["type"] == BlockType.INDEX:
                indices.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
        indexs_list.append(indices)

    # Pass 2: flatten each page's blocks in reading order (table sub-blocks
    # ordered caption -> body -> footnote) for the numbered overlay.
    layout_bbox_list = []
    table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3}
    for page in pdf_info:
        page_block_list = []
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                bbox = block["bbox"]
                page_block_list.append(bbox)
            elif block["type"] in [BlockType.IMAGE]:
                for sub_block in block["blocks"]:
                    bbox = sub_block["bbox"]
                    page_block_list.append(bbox)
            elif block["type"] in [BlockType.TABLE]:
                sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]])
                for sub_block in sorted_blocks:
                    bbox = sub_block["bbox"]
                    page_block_list.append(bbox)
        layout_bbox_list.append(page_block_list)

    # Pass 3: draw one overlay per page and merge it onto the original page.
    pdf_bytes_io = BytesIO(pdf_bytes)
    pdf_docs = PdfReader(pdf_bytes_io)
    output_pdf = PdfWriter()
    for i, page in enumerate(pdf_docs.pages):
        # Build the overlay canvas with the original page's size.
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        custom_page_size = (page_width, page_height)
        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=custom_page_size)
        c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
        c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
        c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
        c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
        c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True)
        c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True)
        c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True)
        c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True)
        c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True)
        c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True)
        c = draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True)
        c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True)
        c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False)
        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)
        page.merge_page(overlay_pdf.pages[0])
        output_pdf.add_page(page)
    # Save the result. The ``filename`` parameter was previously ignored and
    # the output name hard-coded; honor the parameter instead.
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug copy of the PDF with every span's bbox outlined,
    color-coded by span type (text, equations, images, tables, dropped).

    Args:
        pdf_info: per-page layout dicts (middle-JSON "pdf_info").
        pdf_bytes: raw bytes of the source PDF.
        out_path: directory the annotated PDF is written into.
        filename: name of the output file inside ``out_path``.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # Route one span's bbox into the current page's bucket; spans flagged
        # ``cross_page`` are deferred to the next page's bucket instead.
        if span['type'] == ContentType.TEXT:
            if span.get('cross_page', False):
                next_page_text_list.append(span['bbox'])
            else:
                page_text_list.append(span['bbox'])
        elif span['type'] == ContentType.INLINE_EQUATION:
            if span.get('cross_page', False):
                next_page_inline_equation_list.append(span['bbox'])
            else:
                page_inline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.INTERLINE_EQUATION:
            page_interline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.IMAGE:
            page_image_list.append(span['bbox'])
        elif span['type'] == ContentType.TABLE:
            page_table_list.append(span['bbox'])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []
        # Spans carried over from the previous page land on this page first.
        if next_page_text_list:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if next_page_inline_equation_list:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()
        # Collect dropped spans.
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.DISCARDED:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)
        # Collect the useful spans. Pre-merge blocks are sufficient here
        # because spans do not change during paragraph merging.
        for block in page['preproc_blocks']:
            if block['type'] in [
                BlockType.TEXT,
                BlockType.TITLE,
                BlockType.INTERLINE_EQUATION,
                BlockType.LIST,
                BlockType.INDEX,
            ]:
                for line in block['lines']:
                    for span in line['spans']:
                        get_span_info(span)
            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    # Draw one overlay per page and merge it onto the original page.
    pdf_bytes_io = BytesIO(pdf_bytes)
    pdf_docs = PdfReader(pdf_bytes_io)
    output_pdf = PdfWriter()
    for i, page in enumerate(pdf_docs.pages):
        # Build the overlay canvas with the original page's size.
        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
        custom_page_size = (page_width, page_height)
        packet = BytesIO()
        c = canvas.Canvas(packet, pagesize=custom_page_size)
        draw_bbox_without_number(i, text_list, page, c, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
        c.save()
        packet.seek(0)
        overlay_pdf = PdfReader(packet)
        page.merge_page(overlay_pdf.pages[0])
        output_pdf.add_page(page)
    # Save the PDF. The ``filename`` parameter was previously ignored and the
    # output name hard-coded; honor the parameter instead.
    with open(f"{out_path}/{filename}", "wb") as f:
        output_pdf.write(f)
if __name__ == "__main__":
    # Read the source PDF file.
    pdf_path = "examples/demo1.pdf"
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    # Load pdf_info from the middle-JSON produced by the pipeline.
    json_path = "examples/demo1_1746005777.0863056_middle.json"
    with open(json_path, "r", encoding="utf-8") as f:
        pdf_ann = json.load(f)
    pdf_info = pdf_ann["pdf_info"]
    # Render the layout visualization into the examples directory.
    draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf")
class BlockType:
    """String constants naming the block categories used in middle-JSON."""

    # Compound blocks that contain nested sub-blocks
    IMAGE = 'image'
    TABLE = 'table'
    # Sub-blocks nested inside image/table blocks
    IMAGE_BODY = 'image_body'
    IMAGE_CAPTION = 'image_caption'
    IMAGE_FOOTNOTE = 'image_footnote'
    TABLE_BODY = 'table_body'
    TABLE_CAPTION = 'table_caption'
    TABLE_FOOTNOTE = 'table_footnote'
    # Flat text-like blocks
    TEXT = 'text'
    TITLE = 'title'
    INTERLINE_EQUATION = 'interline_equation'
    LIST = 'list'
    INDEX = 'index'
    # Blocks excluded from the main content flow
    DISCARDED = 'discarded'
class ContentType:
    """String constants naming span/content categories."""

    IMAGE = 'image'
    TABLE = 'table'
    TEXT = 'text'
    # Display equation on its own line vs. equation embedded in running text
    INTERLINE_EQUATION = 'interline_equation'
    INLINE_EQUATION = 'inline_equation'
class CategoryId:
    """Integer category ids used by the detection models."""

    # Layout detection categories
    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    # Formula detection categories
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    # OCR text category
    OcrText = 15
    # Extended category
    ImageFootnote = 101
class MakeMode:
    """String constants selecting the output format to generate."""

    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    CONTENT_LIST = 'content_list'
class ModelPath:
    """Repository ids and in-repo relative paths for the models used here."""

    # VLM-backend model repositories (HuggingFace / ModelScope mirrors)
    vlm_root_hf = "opendatalab/MinerU2.0-2505-0.9B"
    vlm_root_modelscope = "OpenDataLab/MinerU2.0-2505-0.9B"
    # Pipeline-backend model repositories
    pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0"
    pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0"
    # Relative paths inside the pipeline repository
    doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt"
    yolo_v8_mfd = "models/MFD/YOLO/yolo_v8_ft.pt"
    unimernet_small = "models/MFR/unimernet_hf_small_2503"
    pytorch_paddle = "models/OCR/paddleocr_torch"
    layout_reader = "models/ReadingOrder/layout_reader"
    slanet_plus = "models/TabRec/SlanetPlus/slanet-plus.onnx"
class SplitFlag:
    """Marker keys attached to spans/blocks during page splitting."""

    CROSS_PAGE = 'cross_page'
    LINES_DELETED = 'lines_deleted'
\ No newline at end of file
import re
import itertools
import html
from typing import Any, Dict, List
from pydantic import (
BaseModel,
computed_field,
model_validator,
)
class TableCell(BaseModel):
    """A single table cell: its text, span sizes and grid offsets."""

    row_span: int = 1
    col_span: int = 1
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Accept the docling-ibm-models dict shape.

        If ``text`` is absent, derive it from ``bbox.token`` or, failing
        that, by joining the tokens in ``text_cell_bboxes``.
        """
        if isinstance(data, Dict):
            # Already in native shape: nothing to convert.
            if "text" in data:
                return data
            text = data["bbox"].get("token", "")
            if not text:
                text_cells = data.pop("text_cell_bboxes", None)
                if text_cells:
                    text = " ".join(el["token"] for el in text_cells)
                text = text.strip()
            data["text"] = text
        return data
class TableData(BaseModel):  # TBD
    """Table payload: a flat cell list plus row/column counts."""

    table_cells: List[TableCell] = []
    num_rows: int = 0
    num_cols: int = 0

    @computed_field  # type: ignore
    @property
    def grid(
        self,
    ) -> List[List[TableCell]]:
        """Materialize the num_rows x num_cols grid (rebuilt on each access)."""
        # Start from a grid of fresh empty cells.
        cells = [
            [
                TableCell(
                    text="",
                    start_row_offset_idx=r,
                    end_row_offset_idx=r + 1,
                    start_col_offset_idx=col,
                    end_col_offset_idx=col + 1,
                )
                for col in range(self.num_cols)
            ]
            for r in range(self.num_rows)
        ]
        # Stamp each real cell over the grid area its spans cover,
        # clamped to the declared grid bounds.
        for cell in self.table_cells:
            row_lo = min(cell.start_row_offset_idx, self.num_rows)
            row_hi = min(cell.end_row_offset_idx, self.num_rows)
            col_lo = min(cell.start_col_offset_idx, self.num_cols)
            col_hi = min(cell.end_col_offset_idx, self.num_cols)
            for r in range(row_lo, row_hi):
                for col in range(col_lo, col_hi):
                    cells[r][col] = cell
        return cells
"""
OTSL
"""
# OTSL (table structure language) tokens.
OTSL_NL = "<nl>"      # row separator
OTSL_FCEL = "<fcel>"  # full cell (followed by its text in the mixed stream)
OTSL_ECEL = "<ecel>"  # empty cell
OTSL_LCEL = "<lcel>"  # continuation of a cell spanning from the left
OTSL_UCEL = "<ucel>"  # continuation of a cell spanning from above
OTSL_XCEL = "<xcel>"  # continuation of a 2D-spanning cell


def otsl_extract_tokens_and_text(s: str):
    """Split an OTSL string into structure tokens and the mixed token/text stream.

    Args:
        s: raw OTSL content.

    Returns:
        tokens: the OTSL structure tokens of ``s`` in order.
        text_parts: tokens AND the non-whitespace text between them, in order
            (the "mixed" stream that ``otsl_parse_texts`` expects).
    """
    # Alternation over the known OTSL tokens, captured so re.split keeps them.
    pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")"
    tokens = re.findall(pattern, s)
    # Because the pattern is a capture group, re.split interleaves the tokens
    # with the in-between text; drop empty / whitespace-only fragments.
    text_parts = [part for part in re.split(pattern, s) if part.strip()]
    return tokens, text_parts
def otsl_parse_texts(texts, tokens):
    """Parse the mixed OTSL stream into TableCell objects.

    Args:
        texts: the mixed stream from otsl_extract_tokens_and_text — OTSL
            structure tokens interleaved with the text that follows each
            <fcel>.
        tokens: the structure tokens only, in order.

    Returns:
        (table_cells, split_row_tokens): the parsed TableCell list and the
        tokens grouped into rows (split on <nl>).
    """
    split_word = OTSL_NL
    # Group the token stream into rows, dropping the <nl> separators themselves.
    split_row_tokens = [
        list(y)
        for x, y in itertools.groupby(tokens, lambda z: z == split_word)
        if not x
    ]
    table_cells = []
    r_idx = 0  # current row index in split_row_tokens
    c_idx = 0  # current column index within the row

    def count_right(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tokens from (r_idx, c_idx) rightwards that belong
        # to which_tokens — measures the horizontal extent of a span.
        span = 0
        c_idx_iter = c_idx
        while tokens[r_idx][c_idx_iter] in which_tokens:
            c_idx_iter += 1
            span += 1
            # Bounds check after advancing, before the next subscript access.
            if c_idx_iter >= len(tokens[r_idx]):
                return span
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        # Count consecutive tokens from (r_idx, c_idx) downwards that belong
        # to which_tokens — measures the vertical extent of a span.
        span = 0
        r_idx_iter = r_idx
        while tokens[r_idx_iter][c_idx] in which_tokens:
            r_idx_iter += 1
            span += 1
            # Bounds check after advancing, before the next subscript access.
            if r_idx_iter >= len(tokens):
                return span
        return span

    for i, text in enumerate(texts):
        cell_text = ""
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
        ]:
            row_span = 1
            col_span = 1
            right_offset = 1
            if text != OTSL_ECEL:
                # A full cell is immediately followed by its text in the mixed
                # stream; take it and look one element further for the next token.
                cell_text = texts[i + 1]
                right_offset = 2
            # Check next element(s) for lcel / ucel / xcel and set
            # row_span / col_span accordingly.
            next_right_cell = ""
            if i + right_offset < len(texts):
                next_right_cell = texts[i + right_offset]
            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens):
                if c_idx < len(split_row_tokens[r_idx + 1]):
                    next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
            if next_right_cell in [
                OTSL_LCEL,
                OTSL_XCEL,
            ]:
                # Horizontally spanning cell (or 2D spanning cell).
                col_span += count_right(
                    split_row_tokens,
                    c_idx + 1,
                    r_idx,
                    [OTSL_LCEL, OTSL_XCEL],
                )
            if next_bottom_cell in [
                OTSL_UCEL,
                OTSL_XCEL,
            ]:
                # Vertically spanning cell (or 2D spanning cell).
                row_span += count_down(
                    split_row_tokens,
                    c_idx,
                    r_idx + 1,
                    [OTSL_UCEL, OTSL_XCEL],
                )
            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )
        # Advance the grid cursor: every cell token moves one column right...
        if text in [
            OTSL_FCEL,
            OTSL_ECEL,
            OTSL_LCEL,
            OTSL_UCEL,
            OTSL_XCEL,
        ]:
            c_idx += 1
        # ...and a row separator resets to the start of the next row.
        if text == OTSL_NL:
            r_idx += 1
            c_idx = 0
    return table_cells, split_row_tokens
def export_to_html(table_data: TableData):
    """Render ``table_data`` as an HTML ``<table>`` string ('' if no cells).

    Row/column spans are honored: a spanning cell is emitted only at its
    top-left grid position; the positions it covers are skipped.
    """
    if len(table_data.table_cells) == 0:
        return ""
    nrows = table_data.num_rows
    ncols = table_data.num_cols
    # ``grid`` is a computed property that rebuilds the whole grid on every
    # access, so hoist it out of the O(rows*cols) loops.
    grid = table_data.grid
    body = ""
    for i in range(nrows):
        body += "<tr>"
        for j in range(ncols):
            cell: TableCell = grid[i][j]
            # Emit a spanning cell only at its top-left origin.
            if cell.start_row_offset_idx != i or cell.start_col_offset_idx != j:
                continue
            content = html.escape(cell.text.strip())
            celltag = "th" if cell.column_header else "td"
            opening_tag = celltag
            if cell.row_span > 1:
                opening_tag += f' rowspan="{cell.row_span}"'
            if cell.col_span > 1:
                opening_tag += f' colspan="{cell.col_span}"'
            body += f"<{opening_tag}>{content}</{celltag}>"
        body += "</tr>"
    return f"<table>{body}</table>"
def convert_otsl_to_html(otsl_content: str):
    """Convert an OTSL table string into an HTML ``<table>`` string."""
    structure_tokens, mixed_stream = otsl_extract_tokens_and_text(otsl_content)
    cells, rows_of_tokens = otsl_parse_texts(mixed_stream, structure_tokens)
    # The column count is the widest row; 0 when the table has no rows at all.
    num_cols = max(len(row) for row in rows_of_tokens) if rows_of_tokens else 0
    payload = TableData(
        num_rows=len(rows_of_tokens),
        num_cols=num_cols,
        table_cells=cells,
    )
    return export_to_html(payload)
# Copyright (c) Opendatalab. All rights reserved.
import hashlib
import json
def compute_md5(file_bytes):
    """Return the uppercase hex MD5 digest of ``file_bytes``.

    A bad merge left two stacked ``def`` headers here (no body under the
    first); restored as one working function, with both names kept below
    for backward compatibility.
    """
    hasher = hashlib.md5()
    hasher.update(file_bytes)
    return hasher.hexdigest().upper()


# Newer call sites use this name; keep both bound to the same function.
bytes_md5 = compute_md5
def compute_sha256(input_string):
    """Return the hex SHA-256 digest of ``input_string`` (UTF-8 encoded).

    A bad merge nested the helper defs under headerless functions here;
    restored as three flat, working functions.
    """
    hasher = hashlib.sha256()
    # Python 3 hash functions operate on bytes, so encode the string first.
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()


def str_md5(input_string):
    """Return the hex MD5 digest of ``input_string`` (UTF-8 encoded)."""
    hasher = hashlib.md5()
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()


def str_sha256(input_string):
    """Return the hex SHA-256 digest of ``input_string`` (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(input_string.encode('utf-8'))
    return hasher.hexdigest()
def dict_md5(d):
    """Hex MD5 of the canonical JSON form of ``d`` (keys sorted, non-ASCII kept)."""
    canonical = json.dumps(d, sort_keys=True, ensure_ascii=False)
    digest = hashlib.md5(canonical.encode('utf-8'))
    return digest.hexdigest()
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
import ast
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import merge_para_with_text
#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
1. 修正渲染或编译错误:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
- 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
2. 保留原始信息:
- 保留原始公式中的所有重要信息
- 不要添加任何原始公式中没有的新信息
IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。
LaTeX recognition result:
$FORMULA
Your corrected result:
"""
text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm')
- 使用上下文和常识进行修正
- 只修正明显的错误,不要不必要的修改内容
- 不要添加额外的句号或其他不必要的标点符号
2. 保持原始结构:
- 保留所有标题和子标题
3. 保留原始内容:
- 保留原始文本中的所有重要信息
- 不要添加任何原始文本中没有的新信息
- 保留段落之间的换行符
4. 保持连贯性:
- 确保内容与前文顺畅连接
- 适当处理在句子中间开始或结束的文本
5. 修正行内公式:
- 去除行内公式前后多余的空格
- 修正公式中的OCR错误
- 确保公式能够通过KaTeX渲染
6. 修正全角字符
- 修正全角标点符号为半角标点符号
- 修正全角字母为半角字母
- 修正全角数字为半角数字
IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
    """LLM-aided formula correction. Placeholder — not implemented yet."""
    pass
def llm_aided_text(pdf_info_dict, text_aided_config):
    """LLM-aided OCR text correction. Placeholder — not implemented yet."""
    pass
def llm_aided_title(pdf_info_dict, title_aided_config):
def llm_aided_title(page_info_list, title_aided_config):
client = OpenAI(
api_key=title_aided_config["api_key"],
base_url=title_aided_config["base_url"],
......@@ -78,8 +14,8 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
title_dict = {}
origin_title_list = []
i = 0
for page_num, page in pdf_info_dict.items():
blocks = page["para_blocks"]
for page_info in page_info_list:
blocks = page_info["para_blocks"]
for block in blocks:
if block["type"] == "title":
origin_title_list.append(block)
......@@ -92,7 +28,7 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
line_avg_height = sum(page_line_height_list) / len(page_line_height_list)
else:
line_avg_height = int(block['bbox'][3] - block['bbox'][1])
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_num[5:])+1]
title_dict[f"{i}"] = [title_text, line_avg_height, int(page_info['page_idx']) + 1]
i += 1
# logger.info(f"Title list: {title_dict}")
......@@ -115,16 +51,21 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
- 标题从前至后的层级必须是连续的,不能跳过层级
- 标题层级最多为4级,不要添加过多的层级
- 优化后的标题只保留代表该标题的层级的整数,不要保留其他信息
5. 合理性检查与微调:
- 在完成初步分级后,仔细检查分级结果的合理性
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
- 确保最终的分级结果符合文档的实际结构和逻辑
- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
IMPORTANT:
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
{{0:1,1:2,2:2,3:3}}
{{
0:1,
1:2,
2:2,
3:3
}}
不需要对字典格式化,不需要返回任何其他信息。
Input title list:
......@@ -145,16 +86,23 @@ Corrected title list:
{'role': 'user', 'content': title_optimize_prompt}],
temperature=0.7,
)
# logger.info(f"Title completion: {completion.choices[0].message.content}")
dict_completion = ast.literal_eval(completion.choices[0].message.content)
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
content = completion.choices[0].message.content.strip()
# logger.info(f"Title completion: {content}")
if "</think>" in content:
idx = content.index("</think>") + len("</think>")
content = content[idx:].strip()
import json_repair
dict_completion = json_repair.loads(content)
dict_completion = {int(k): int(v) for k, v in dict_completion.items()}
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
if len(dict_completion) == len(title_dict):
for i, origin_title_block in enumerate(origin_title_list):
origin_title_block["level"] = int(dict_completion[i])
break
else:
logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
logger.warning(
"The number of titles in the optimized result is not equal to the number of titles in the input.")
retry_count += 1
except Exception as e:
logger.exception(e)
......
import time
import torch
import gc
from PIL import Image
from loguru import logger
import numpy as np
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
from magic_pdf.libs.clean_memory import clean_memory
from mineru.utils.boxbase import get_minbox_if_overlap_by_ratio
def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
def crop_img(input_res, input_img, crop_paste_x=0, crop_paste_y=0):
crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
......@@ -16,15 +17,24 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
if isinstance(input_img, np.ndarray):
# Crop the original image using numpy slicing
cropped_img = input_np_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Create a white background array
return_image = np.ones((crop_new_height, crop_new_width, 3), dtype=np.uint8) * 255
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
# Crop the original image using numpy slicing
cropped_img = input_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
# Paste the cropped image onto the white background
return_image[crop_paste_y:crop_paste_y + (crop_ymax - crop_ymin),
crop_paste_x:crop_paste_x + (crop_xmax - crop_xmin)] = cropped_img
else:
# Create a white background array
return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
# Crop image
crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
cropped_img = input_img.crop(crop_box)
return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width,
crop_new_height]
......@@ -287,6 +297,20 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
def clean_memory(device='cuda'):
    """Release cached accelerator memory for ``device``, then run the Python GC."""
    target = str(device)
    if device == 'cuda':
        # Only touch the CUDA caches when CUDA is actually usable.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif target.startswith("npu"):
        # torch_npu is imported lazily so non-NPU hosts never need it.
        import torch_npu
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif target.startswith("mps"):
        torch.mps.empty_cache()
    gc.collect()
def clean_vram(device, vram_threshold=8):
total_memory = get_vram(device)
if total_memory and total_memory <= vram_threshold:
......
import os
from huggingface_hub import snapshot_download as hf_snapshot_download
from modelscope import snapshot_download as ms_snapshot_download
from mineru.utils.config_reader import get_local_models_dir
from mineru.utils.enum_class import ModelPath
def auto_download_and_get_model_root_path(relative_path: str, repo_mode='pipeline') -> str:
    """Resolve the local root path for a model file or directory, downloading on demand.

    - File input: join the returned root with ``relative_path`` to get the
      local file's absolute path.
    - Directory input: the download cache mirrors the repo structure, so the
      same join applies.

    Args:
        relative_path: file or directory path relative to the repo root.
        repo_mode: repository flavor, 'pipeline' or 'vlm'.

    Returns:
        Local root path (download cache dir, or the configured local root).

    Raises:
        ValueError: unknown ``repo_mode``, unknown model source, or missing
            local configuration.
    """
    model_source = os.getenv('MINERU_MODEL_SOURCE', "huggingface")

    # Fully local setup: no download, just return the configured root.
    if model_source == 'local':
        local_models_config = get_local_models_dir()
        root_path = local_models_config.get(repo_mode, None)
        if not root_path:
            raise ValueError(f"Local path for repo_mode '{repo_mode}' is not configured.")
        return root_path

    # Map each repo mode to its per-source repository id.
    repo_mapping = {
        'pipeline': {
            'huggingface': ModelPath.pipeline_root_hf,
            'modelscope': ModelPath.pipeline_root_modelscope,
            'default': ModelPath.pipeline_root_hf,
        },
        'vlm': {
            'huggingface': ModelPath.vlm_root_hf,
            'modelscope': ModelPath.vlm_root_modelscope,
            'default': ModelPath.vlm_root_hf,
        },
    }
    if repo_mode not in repo_mapping:
        raise ValueError(f"Unsupported repo_mode: {repo_mode}, must be 'pipeline' or 'vlm'")
    # Fall back to the default repo when the source has no explicit entry.
    repo = repo_mapping[repo_mode].get(model_source, repo_mapping[repo_mode]['default'])

    if model_source == "huggingface":
        snapshot_fn = hf_snapshot_download
    elif model_source == "modelscope":
        snapshot_fn = ms_snapshot_download
    else:
        raise ValueError(f"未知的仓库类型: {model_source}")

    # Restrict the snapshot to just the requested file/dir (and its subtree).
    pattern_root = relative_path.strip('/')
    return snapshot_fn(repo, allow_patterns=[pattern_root, pattern_root + "/*"])
if __name__ == '__main__':
    # Quick manual check: resolve one repo file and print its local path.
    path1 = "models/README.md"
    root = auto_download_and_get_model_root_path(path1)
    print("本地文件绝对路径:", os.path.join(root, path1))
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import copy
import cv2
import numpy as np
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into lines by vertical overlap.

    The spans are sorted by their top edge (y0) in place, then consecutive
    spans whose y-overlap ratio with the previous span exceeds ``threshold``
    are merged into the same line.

    Args:
        spans: list of dicts, each with a 'bbox' of [x0, y0, x1, y1].
        threshold: minimum overlap / min-height ratio to share a line.

    Returns:
        List of lines, each a list of span dicts ([] for empty input).
    """
    if not spans:
        return []
    # Sort by top edge so grouping becomes a single forward pass.
    spans.sort(key=lambda span: span['bbox'][1])
    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Same line when this span vertically overlaps the previous one enough.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]
    # current_line is never empty here, so the last line is always flushed.
    lines.append(current_line)
    return lines


def __is_overlaps_y_exceeds_threshold(bbox1,
                                      bbox2,
                                      overlap_ratio_threshold=0.8):
    """Return True when the y-overlap of the two bboxes exceeds
    ``overlap_ratio_threshold`` of the shorter bbox's height."""
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2
    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    height1, height2 = y1_1 - y0_1, y1_2 - y0_2
    min_height = min(height1, height2)
    return (overlap / min_height) > overlap_ratio_threshold
def img_decode(content: bytes):
......@@ -212,10 +248,7 @@ def merge_det_boxes(dt_boxes):
angle_boxes_list.append(text_box)
continue
text_box_dict = {
'bbox': text_bbox,
'type': 'text',
}
text_box_dict = {'bbox': text_bbox}
dt_boxes_dict_list.append(text_box_dict)
# Merge adjacent text regions into lines
......
# Copyright (c) Opendatalab. All rights reserved.
import re
from io import BytesIO
import numpy as np
import pypdfium2 as pdfium
from loguru import logger
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pypdf import PdfReader
def classify(pdf_bytes):
    """
    Decide whether a PDF's text can be extracted directly or needs OCR.
    Args:
        pdf_bytes: raw bytes of the PDF file
    Returns:
        str: 'txt' if text can be extracted directly, 'ocr' if OCR is needed
    """
    try:
        # Work on a random sample of pages extracted from the byte data.
        sample_pdf_bytes = extract_pages(pdf_bytes)
        pdf = pdfium.PdfDocument(sample_pdf_bytes)
        # Page count of the sampled PDF.
        page_count = len(pdf)
        # An empty PDF goes straight to OCR.
        if page_count == 0:
            return 'ocr'
        # Check at most 10 pages.
        pages_to_check = min(page_count, 10)
        # Threshold: fewer than 50 clean characters per page on average → OCR.
        chars_threshold = 50
        # Too little text, or garbled (cid) text, means OCR is required.
        if (get_avg_cleaned_chars_per_page(pdf, pages_to_check) < chars_threshold) or detect_invalid_chars(sample_pdf_bytes):
            return 'ocr'
        else:
            # Pages dominated by images also need OCR.
            if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.9:
                return 'ocr'
            return 'txt'
    except Exception as e:
        logger.error(f"判断PDF类型时出错: {e}")
        # Fail safe: fall back to OCR on any error.
        return 'ocr'
def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
    """Average number of non-whitespace characters per page.

    Reads the first ``pages_to_check`` pages of ``pdf_doc`` (a pdfium
    document) and closes the document before returning.

    Args:
        pdf_doc: open pdfium.PdfDocument (indexable; closed on exit).
        pages_to_check: number of leading pages to sample (must be >= 1).

    Returns:
        float: average count of non-whitespace characters per sampled page.
    """
    cleaned_total_chars = 0
    for i in range(pages_to_check):
        page = pdf_doc[i]
        text = page.get_textpage().get_text_bounded()
        # Count only non-whitespace characters.
        cleaned_total_chars += len(re.sub(r'\s+', '', text))
    avg_cleaned_chars_per_page = cleaned_total_chars / pages_to_check
    pdf_doc.close()  # release the pdfium document
    return avg_cleaned_chars_per_page
def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
    """Fraction of the first ``pages_to_check`` pages whose estimated image
    coverage is total (>= 1.0 of the page area).

    Coverage is a rough estimate: image XObject pixel dimensions are summed
    without applying transformation matrices, then divided by the page area.
    """
    pdf_stream = BytesIO(sample_pdf_bytes)
    pdf_reader = PdfReader(pdf_stream)
    # Number of pages classified as high-image-coverage.
    high_image_coverage_pages = 0
    # Inspect the leading pages only.
    for i in range(pages_to_check):
        page = pdf_reader.pages[i]
        # Page dimensions from the media box.
        page_width = float(page.mediabox.width)
        page_height = float(page.mediabox.height)
        page_area = page_width * page_height
        # Accumulate an estimated image area.
        image_area = 0
        if '/Resources' in page:
            resources = page['/Resources']
            if '/XObject' in resources:
                x_objects = resources['/XObject']
                # Sum the area of all image XObjects on the page.
                for obj_name in x_objects:
                    try:
                        obj = x_objects[obj_name]
                        if obj['/Subtype'] == '/Image':
                            # Image pixel dimensions.
                            width = obj.get('/Width', 0)
                            height = obj.get('/Height', 0)
                            # Rough area estimate: no transform matrix applied,
                            # so pixel size stands in for rendered size.
                            scale_factor = 1.0  # assumed scale
                            img_area = width * height * scale_factor
                            image_area += img_area
                    except Exception as e:
                        # Skip malformed XObjects silently.
                        continue
        # Estimated coverage, clamped to [0, 1].
        estimated_coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
        # NOTE(review): the original comment claimed an 80% threshold, but the
        # code requires full coverage (>= 1) — confirm which is intended.
        if estimated_coverage >= 1:
            high_image_coverage_pages += 1
    # Ratio of high-coverage pages among those checked.
    high_image_coverage_ratio = high_image_coverage_pages / pages_to_check
    pdf_stream.close()  # release the byte stream
    pdf_reader.close()
    return high_image_coverage_ratio
def extract_pages(src_pdf_bytes: bytes) -> bytes:
    """Build a new PDF from up to 10 randomly chosen pages of the input.

    Args:
        src_pdf_bytes: raw bytes of the source PDF.

    Returns:
        Bytes of the sampled PDF, or b'' when the source is empty or the
        sampling fails.
    """
    pdf = pdfium.PdfDocument(src_pdf_bytes)
    total_page = len(pdf)
    if total_page == 0:
        # Nothing to sample from.
        logger.warning("PDF is empty, return empty document")
        return b''
    # Cap the sample at 10 pages.
    select_page_cnt = min(10, total_page)
    # Random page indices, without replacement.
    chosen_indices = np.random.choice(total_page, select_page_cnt, replace=False).tolist()
    # Assemble the sampled pages into a fresh document.
    sample_docs = pdfium.PdfDocument.new()
    try:
        sample_docs.import_pages(pdf, chosen_indices)
        out_buffer = BytesIO()
        sample_docs.save(out_buffer)
        return out_buffer.getvalue()
    except Exception as e:
        logger.exception(e)
        return b''  # empty bytes on failure
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
    """Detect whether a PDF sample contains garbled characters.

    pdfminer is slow, so callers are expected to pass a ~10 page random sample
    (see extract_pages) rather than the whole document.

    Args:
        sample_pdf_bytes: Bytes of the sampled PDF.

    Returns:
        bool: True when the document looks garbled, False otherwise.
    """
    # Fix: the original opened the docstring with four quotes ("""") — a typo.
    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    laparams = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
    text = text.replace("\n", "")
    # Garbled glyphs come out of pdfminer as "(cid:xxx)" placeholders.
    cid_pattern = re.compile(r'\(cid:\d+\)')
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Each "(cid:xxx)" placeholder stands for exactly one source character.
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    # A document with more than 5% cid placeholders is considered garbled.
    return cid_chars_ratio > 0.05
if __name__ == '__main__':
    # Manual smoke test against a developer-local sample PDF.
    with open('/Users/myhloli/pdf/luanma2x10.pdf', 'rb') as f:
        p_bytes = f.read()
    logger.info(f"PDF分类结果: {classify(p_bytes)}")
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
from io import BytesIO
import pypdfium2 as pdfium
from loguru import logger
from PIL import Image
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
from .hash_utils import str_sha256
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200) -> dict:
    """Render a single pdfium page to a PIL image plus its base64 encoding.

    Args:
        page: The pdfium.PdfPage to render.
        dpi: Target rendering resolution. Defaults to 200.

    Returns:
        dict: {'img_base64': str, 'img_pil': PIL image, 'scale': float}
    """
    pil_img, scale = page_to_image(page, dpi=dpi)
    return {
        "img_base64": image_to_b64str(pil_img),
        "img_pil": pil_img,
        "scale": scale,
    }
def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
):
    """Render a page range of a PDF (given as bytes) into image dicts.

    Returns:
        tuple: (list of image dicts from pdf_page_to_image, the open PdfDocument)
    """
    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    page_count = len(pdf_doc)
    # Normalize the end page: None / negative means "through the last page".
    if end_page_id is None or end_page_id < 0:
        end_page_id = page_count - 1
    elif end_page_id > page_count - 1:
        logger.warning("end_page_id is out of range, use images length")
        end_page_id = page_count - 1
    images_list = [
        pdf_page_to_image(pdf_doc[idx], dpi=dpi)
        for idx in range(page_count)
        if start_page_id <= idx <= end_page_id
    ]
    return images_list, pdf_doc
def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
    """Crop the bbox region of page *page_num* and persist it as a jpg.

    The logical name is "{page_num}_{x0}_{y0}_{x1}_{y1}" with bbox values
    truncated to ints; the file itself is written under a flattened
    sha256-derived path, which is returned.
    """
    # Integer bbox coordinates make a stable, readable logical name.
    filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
    # Legacy-style logical path (no bucket prefix).
    # Fix: the original left `filename` unused and embedded a garbled
    # "(unknown)" literal in the path.
    img_path = f"{return_path}_{filename}" if return_path is not None else None
    # New layout: flat sha256-based file name.
    # NOTE(review): str_sha256(img_path) with img_path=None looks suspicious —
    # confirm callers always pass a non-None return_path.
    img_hash256_path = f"{str_sha256(img_path)}.jpg"
    crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
    img_bytes = image_to_bytes(crop_img, image_format="JPEG")
    image_writer.write(img_hash256_path, img_bytes)
    return img_hash256_path
def get_crop_img(bbox: tuple, pil_img, scale=2):
    """Crop a region from *pil_img*, scaling the PDF-space bbox into pixel space."""
    left, top, right, bottom = (int(coord * scale) for coord in bbox)
    return pil_img.crop((left, top, right, bottom))
def images_bytes_to_pdf_bytes(image_bytes):
    """Convert a single image (raw bytes) into a one-page PDF, returned as bytes."""
    # Decode and normalize to RGB, which the PDF encoder requires.
    rgb_image = Image.open(BytesIO(image_bytes)).convert("RGB")
    with BytesIO() as pdf_buffer:
        rgb_image.save(pdf_buffer, format="PDF", save_all=True)
        return pdf_buffer.getvalue()
# Copyright (c) Opendatalab. All rights reserved.
import base64
from io import BytesIO
from loguru import logger
from PIL import Image
from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
def page_to_image(
    page: PdfPage,
    dpi: int = 144,  # changed from 200 to 144
    max_width_or_height: int = 2560,  # changed from 4500 to 2560
) -> tuple[Image.Image, float]:
    """Render a pdfium page to a PIL image.

    The render scale is derived from *dpi* (72 dpi == 1.0). When the page's
    longer side exceeds *max_width_or_height*, the scale is replaced so that
    side maps to at most that many pixels.
    NOTE(review): the cap compares the page's unscaled size in points, not
    the rendered pixel size — confirm this is intentional.

    Returns:
        tuple[Image.Image, float]: the rendered image and the scale used.
    """
    scale = dpi / 72
    long_side_length = max(*page.get_size())
    if long_side_length > max_width_or_height:
        scale = max_width_or_height / long_side_length
    bitmap: PdfBitmap = page.render(scale=scale)  # type: ignore
    try:
        image = bitmap.to_pil()
    finally:
        # Free the native bitmap even if the PIL conversion fails.
        try:
            bitmap.close()
        except Exception:
            pass
    return image, scale
def image_to_bytes(
    image: Image.Image,
    image_format: str = "PNG",  # "JPEG" also works
) -> bytes:
    """Serialize *image* into raw bytes using the given encoder format."""
    buffer = BytesIO()
    try:
        image.save(buffer, format=image_format)
        return buffer.getvalue()
    finally:
        buffer.close()
def image_to_b64str(
    image: Image.Image,
    image_format: str = "PNG",  # "JPEG" also works
) -> str:
    """Encode *image* as a base64 text string in the given format."""
    raw = image_to_bytes(image, image_format)
    return base64.b64encode(raw).decode("utf-8")
def pdf_to_images(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
) -> list[Image.Image]:
    """Render a page range of *pdf* to PIL images.

    Accepts a path, raw bytes, or an already-open PdfDocument. A document
    opened here is closed before returning; a caller-supplied PdfDocument is
    left open, since the caller owns it (the original closed it regardless,
    breaking callers that reuse their document).
    """
    owns_doc = not isinstance(pdf, PdfDocument)
    doc = PdfDocument(pdf) if owns_doc else pdf
    page_num = len(doc)
    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
    if end_page_id > page_num - 1:
        logger.warning("end_page_id is out of range, use images length")
        end_page_id = page_num - 1
    images = []
    try:
        for i in range(start_page_id, end_page_id + 1):
            image, _ = page_to_image(doc[i], dpi, max_width_or_height)
            images.append(image)
    finally:
        if owns_doc:
            # Close only what we opened ourselves.
            try:
                doc.close()
            except Exception:
                pass
    return images
def pdf_to_images_bytes(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    image_format: str = "PNG",
) -> list[bytes]:
    """Render a page range of *pdf* and return each page as encoded image bytes."""
    rendered = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
    return [image_to_bytes(page_image, image_format) for page_image in rendered]
def pdf_to_images_b64strs(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    image_format: str = "PNG",
) -> list[str]:
    """Render a page range of *pdf* and return each page as a base64 string."""
    rendered = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
    return [image_to_b64str(page_image, image_format) for page_image in rendered]
from typing import List
import math
import pypdfium2 as pdfium
from pdftext.pdf.chars import get_chars, deduplicate_chars
from pdftext.pdf.pages import get_spans, get_lines, assign_scripts, get_blocks
def get_page(
    page: pdfium.PdfPage,
    quote_loosebox: bool = True,
    superscript_height_threshold: float = 0.7,
    line_distance_threshold: float = 0.1,
) -> dict:
    """Extract structured text (blocks -> lines -> spans -> chars) from a pdfium page.

    Args:
        page: The pdfium page to read.
        quote_loosebox: Passed through to pdftext's char extraction.
        superscript_height_threshold: Height ratio used to detect super/subscripts.
        line_distance_threshold: Distance ratio used when grouping lines.

    Returns:
        dict: {'bbox', 'width', 'height', 'rotation', 'blocks'}
    """
    textpage = page.get_textpage()
    page_bbox: List[float] = page.get_bbox()
    page_width = math.ceil(abs(page_bbox[2] - page_bbox[0]))
    page_height = math.ceil(abs(page_bbox[1] - page_bbox[3]))
    page_rotation = 0
    try:
        page_rotation = page.get_rotation()
    except Exception:
        # Fix: narrowed from a bare except. Rotation stays 0 when unavailable.
        pass
    chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
    spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
    lines = get_lines(spans)
    assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
    blocks = get_blocks(lines)
    # Use a distinct name instead of shadowing the `page` parameter.
    page_dict = {
        "bbox": page_bbox,
        "width": page_width,
        "height": page_height,
        "rotation": page_rotation,
        "blocks": blocks
    }
    return page_dict
\ No newline at end of file
import asyncio
import threading
from queue import Queue
from typing import Any, AsyncIterable, Coroutine, Iterable, TypeVar
T = TypeVar("T")
def run_async(coroutine: Coroutine[Any, Any, T]) -> T:
    """Run *coroutine* to completion from synchronous code and return its result.

    Works both outside and inside a running event loop. The original called
    ``loop.run_until_complete`` on the *running* loop returned by
    ``asyncio.get_running_loop()``, which always raises
    "This event loop is already running"; instead, when a loop is already
    running in this thread, the coroutine is executed on a fresh loop in a
    dedicated worker thread.

    Raises:
        ValueError: when *coroutine* is not actually a coroutine object.
    """
    if not asyncio.iscoroutine(coroutine):
        raise ValueError("a coroutine was expected, got {!r}".format(coroutine))
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: the simple path.
        return asyncio.run(coroutine)
    # A loop is already running here; run the coroutine on its own loop
    # in a worker thread and relay the outcome.
    outcome: dict = {}

    def _worker():
        try:
            outcome['value'] = asyncio.run(coroutine)
        except BaseException as exc:  # propagate everything to the caller
            outcome['error'] = exc

    worker = threading.Thread(target=_worker, daemon=True)
    worker.start()
    worker.join()
    if 'error' in outcome:
        raise outcome['error']
    return outcome['value']
def iter_async(iterable: AsyncIterable[T]) -> Iterable[T]:
    """Consume an async iterable from synchronous code, yielding its items.

    The async iteration runs on its own event loop in a daemon thread; items
    are handed over through a queue, and exceptions raised by the async side
    are re-raised in the consumer. Fixes two defects of the original: a bare
    ``None`` sentinel silently truncated streams that legitimately yield
    ``None``, and any Exception *instance* yielded by the iterable was
    wrongly re-raised. A unique sentinel plus (is_signal, payload) tuples
    disambiguate data from control messages.

    Raises:
        ValueError: when *iterable* is not an AsyncIterable.
    """
    if not isinstance(iterable, AsyncIterable):
        raise ValueError("an async iterable was expected, got {!r}".format(iterable))
    queue: Queue = Queue()
    done = object()  # unique end-of-stream marker

    async def _drain():
        try:
            async for item in iterable:
                queue.put((False, item))
        except Exception as exc:
            queue.put((True, exc))
        else:
            queue.put((True, done))

    def _run_in_thread():
        # A fresh thread has no running loop, so asyncio.run is always safe.
        asyncio.run(_drain())

    thread = threading.Thread(target=_run_in_thread, daemon=True)
    thread.start()
    while True:
        is_signal, payload = queue.get()
        if is_signal:
            if payload is done:
                break
            raise payload
        yield payload
    thread.join()
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, calculate_overlap_area_in_bbox1_area_ratio
# Copyright (c) Opendatalab. All rights reserved.
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold
# Sort the spans of each line from left to right
def line_sort_spans_by_left_to_right(lines):
    """Sort each line's spans by x0 and wrap lines as {'bbox', 'spans'} dicts.

    The line bbox is the union of its spans' bboxes. This reconstructs the
    function, which was truncated by a bad merge (the list literal was never
    closed and the append/return lines were stranded further down the file).
    """
    line_objects = []
    for line in lines:
        # Reading order: sort spans by their left edge.
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects
def fill_spans_in_blocks(blocks, spans, radio):
    """Assign spans to the blocks they overlap, consuming them from *spans*.

    A span joins a block when its overlap ratio exceeds *radio* and its type
    is compatible with the block type. Matched spans are removed from the
    input list so later blocks cannot claim them again.
    """
    block_with_spans = []
    for block in blocks:
        btype = block[7]
        bbox = block[0:4]
        entry = {
            'type': btype,
            'bbox': bbox,
        }
        # Grouped image/table parts carry their group id in the last slot.
        if btype in [
            BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
        ]:
            entry['group_id'] = block[-1]
        matched = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > radio
            and span_block_type_compatible(span['type'], btype)
        ]
        entry['spans'] = matched
        block_with_spans.append(entry)
        # Consume the matched spans from the shared pool.
        for span in matched:
            spans.remove(span)
    return block_with_spans, spans
def span_block_type_compatible(span_type, block_type):
    """Return True when a span of *span_type* may live inside a block of *block_type*.

    Reconstructed: the original had five foreign lines from another function
    interleaved into its body by a bad merge.
    """
    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
        # Plain text and inline equations fit any text-like block.
        return block_type in [
            BlockType.TEXT,
            BlockType.TITLE,
            BlockType.IMAGE_CAPTION,
            BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_CAPTION,
            BlockType.TABLE_FOOTNOTE,
            BlockType.DISCARDED
        ]
    elif span_type == ContentType.INTERLINE_EQUATION:
        return block_type in [BlockType.INTERLINE_EQUATION, BlockType.TEXT]
    elif span_type == ContentType.IMAGE:
        return block_type in [BlockType.IMAGE_BODY]
    elif span_type == ContentType.TABLE:
        return block_type in [BlockType.TABLE_BODY]
    else:
        return False
def fix_discarded_block(discarded_block_with_spans):
    """Normalize every discarded block from span form into line-structured form."""
    return [fix_text_block(block) for block in discarded_block_with_spans]
def fix_text_block(block):
    """Convert a span-based text block into a line-based one.

    Interline equations inside a text block are demoted to inline type, then
    the spans are merged into lines and sorted left-to-right.
    """
    for span in block['spans']:
        # Equations embedded in running text must render inline.
        if span['type'] == ContentType.INTERLINE_EQUATION:
            span['type'] = ContentType.INLINE_EQUATION
    merged_lines = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(merged_lines)
    del block['spans']
    return block
def merge_spans_to_line(spans, threshold=0.6):
......@@ -34,11 +91,11 @@ def merge_spans_to_line(spans, threshold=0.6):
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] or any(s['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
ContentType.INTERLINE_EQUATION, ContentType.IMAGE,
ContentType.TABLE
] for s in current_line):
# 则开始新行
lines.append(current_line)
......@@ -60,70 +117,36 @@ def merge_spans_to_line(spans, threshold=0.6):
return lines
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [
BlockType.Text,
BlockType.Title,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableCaption,
BlockType.TableFootnote,
BlockType.Discarded
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
line_objects = []
for line in lines:
# 按照x0坐标排序
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
min(span['bbox'][1] for span in line), # y0
max(span['bbox'][2] for span in line), # x1
max(span['bbox'][3] for span in line), # y1
]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
return block_type in [BlockType.ImageBody]
elif span_type == ContentType.Table:
return block_type in [BlockType.TableBody]
else:
return False
def fill_spans_in_blocks(blocks, spans, radio):
"""将allspans中的span按位置关系,放入blocks中."""
block_with_spans = []
for block in blocks:
block_type = block[7]
block_bbox = block[0:4]
block_dict = {
'type': block_type,
'bbox': block_bbox,
}
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict['group_id'] = block[-1]
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
block_spans.append(span)
block_dict['spans'] = block_spans
block_with_spans.append(block_dict)
# 从spans删除已经放入block_spans中的span
if len(block_spans) > 0:
for span in block_spans:
spans.remove(span)
return block_with_spans, spans
line_objects.append({
'bbox': line_bbox,
'spans': line,
})
return line_objects
def fix_block_spans_v2(block_with_spans):
def fix_block_spans(block_with_spans):
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
if block_type in [BlockType.Text, BlockType.Title,
BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableCaption, BlockType.TableFootnote
if block_type in [BlockType.TEXT, BlockType.TITLE,
BlockType.IMAGE_CAPTION, BlockType.IMAGE_CAPTION,
BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
]:
block = fix_text_block(block)
elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
elif block_type in [BlockType.INTERLINE_EQUATION, BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
block = fix_interline_block(block)
else:
continue
......@@ -131,29 +154,9 @@ def fix_block_spans_v2(block_with_spans):
return fix_blocks
def fix_discarded_block(discarded_block_with_spans):
fix_discarded_blocks = []
for block in discarded_block_with_spans:
block = fix_text_block(block)
fix_discarded_blocks.append(block)
return fix_discarded_blocks
def fix_text_block(block):
# 文本block中的公式span都应该转换成行内type
for span in block['spans']:
if span['type'] == ContentType.InterlineEquation:
span['type'] = ContentType.InlineEquation
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines
del block['spans']
return block
def fix_interline_block(block):
    """Convert an interline-equation/image/table block from spans to sorted lines.

    Fix: removed a duplicated, unreachable ``return block`` left behind by a
    bad merge.
    """
    block_lines = merge_spans_to_line(block['spans'])
    sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
    block['lines'] = sort_block_lines
    del block['spans']
    return block
\ No newline at end of file
# Copyright (c) Opendatalab. All rights reserved.
import re
import statistics
import cv2
import numpy as np
from loguru import logger
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio, calculate_iou, \
get_minbox_if_overlap_by_ratio
from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.pdf_image_tools import get_crop_img
from mineru.utils.pdf_text_tool import get_page
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Keep only the spans that overlap a layout block of a compatible type.

    Spans overlapping a discarded block (>0.4) are always kept; image/table
    spans must overlap an image/table body block (>0.5); everything else must
    overlap some other block type (>0.5).
    """
    def get_block_bboxes(blocks, block_type_list):
        # Blocks are flat lists: [x0, y0, x1, y1, ..., type at index 7, ...]
        return [block[0:4] for block in blocks if block[7] in block_type_list]
    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.IMAGE_BODY])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TABLE_BODY])
    other_block_type = []
    # Collect every other declared block type. Fix: iterating
    # BlockType.__dict__.values() also picked up class-machinery strings
    # (__module__, __qualname__, __doc__); filter those out by attribute name.
    for attr_name, block_type in BlockType.__dict__.items():
        if attr_name.startswith('__') or not isinstance(block_type, str):
            continue
        if block_type not in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]:
            other_block_type.append(block_type)
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.DISCARDED])
    new_spans = []
    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']
        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
               discarded_block_bboxes):
            new_spans.append(span)
            continue
        if span_type == ContentType.IMAGE:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   image_bboxes):
                new_spans.append(span)
        elif span_type == ContentType.TABLE:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   table_bboxes):
                new_spans.append(span)
        else:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   other_block_bboxes):
                new_spans.append(span)
    return new_spans
def remove_overlaps_low_confidence_spans(spans):
    """Drop the lower-confidence span of every heavily-overlapping pair (IoU > 0.9).

    Mutates *spans* in place and returns (kept_spans, dropped_spans).
    NOTE: comparisons (`!=`, `in`, `remove`) use dict value-equality, so two
    identical-looking spans are treated as the same span.
    """
    dropped_spans = []
    # Among overlapping spans, remove the ones with the lower confidence score.
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 may already be scheduled for dropping.
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                        if span1['score'] < span2['score']:
                            span_need_remove = span1
                        else:
                            span_need_remove = span2
                        if (
                            span_need_remove is not None
                            and span_need_remove not in dropped_spans
                        ):
                            dropped_spans.append(span_need_remove)
    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
    return spans, dropped_spans
def remove_overlaps_min_spans(spans):
    """Among span pairs overlapping by more than 65%, drop the smaller span.

    Mutates *spans* in place and returns (kept_spans, dropped_spans).
    NOTE: membership and removal use dict value-equality.
    """
    dropped_spans = []
    # Remove the smaller of the two overlapping spans.
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 may already be scheduled for dropping.
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                    if overlap_box is not None:
                        # Map the winning (smaller) bbox back to its span object.
                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if span_need_remove is not None and span_need_remove not in dropped_spans:
                            dropped_spans.append(span_need_remove)
    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
    return spans, dropped_spans
def __replace_ligatures(text: str):
    """Expand single-codepoint typographic ligatures (fi, fl, ...) into plain letters."""
    ligature_map = {
        'fi': 'fi', 'fl': 'fl', 'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'ſt': 'ft', 'st': 'st'
    }
    joined_pattern = '|'.join(map(re.escape, ligature_map))
    return re.sub(joined_pattern, lambda match: ligature_map[match.group()], text)
def __replace_unicode(text: str):
    """Remove CRLF pairs and map the stray U+0002 control char to a hyphen."""
    control_map = {
        '\r\n': '', '\u0002': '-',
    }
    joined_pattern = '|'.join(map(re.escape, control_map))
    return re.sub(joined_pattern, lambda match: control_map[match.group()], text)
"""pdf_text dict方案 char级别"""
def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded_blocks):
    """Fill text spans with native PDF characters, falling back to OCR crops.

    Horizontal spans are filled char-by-char from the PDF text layer; vertical
    spans are filled from whole pdf lines; spans that stay empty are cropped
    from the page image and marked for OCR. Mutates *spans* in place and
    returns it.
    """
    page_dict = get_page(pdf_page)
    page_all_chars = []
    page_all_lines = []
    for block in page_dict['blocks']:
        for line in block['lines']:
            if 0 < abs(line['rotation']) < 90:
                # Skip lines rotated strictly between 0 and 90 degrees.
                continue
            page_all_lines.append(line)
            for span in line['spans']:
                for char in span['chars']:
                    page_all_chars.append(char)
    # Median height over all text spans (drives the vertical-span test below).
    span_height_list = []
    for span in spans:
        if span['type'] in [ContentType.TEXT]:
            span_height = span['bbox'][3] - span['bbox'][1]
            span['height'] = span_height
            span['width'] = span['bbox'][2] - span['bbox'][0]
            span_height_list.append(span_height)
    if len(span_height_list) == 0:
        return spans
    else:
        median_span_height = statistics.median(span_height_list)
    useful_spans = []
    unuseful_spans = []
    # Two traits mark a vertical span: 1. its height covers several lines,
    # 2. its height/width ratio exceeds a threshold.
    vertical_spans = []
    for span in spans:
        if span['type'] in [ContentType.TEXT]:
            for block in all_bboxes + all_discarded_blocks:
                if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
                    continue
                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
                    if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
                        vertical_spans.append(span)
                    elif block in all_bboxes:
                        useful_spans.append(span)
                    else:
                        unuseful_spans.append(span)
                    break
    # Vertical spans are filled directly from whole pdf lines.
    """垂直的span框直接用line进行填充"""
    if len(vertical_spans) > 0:
        for pdfium_line in page_all_lines:
            for span in vertical_spans:
                if calculate_overlap_area_in_bbox1_area_ratio(pdfium_line['bbox'].bbox, span['bbox']) > 0.5:
                    for pdfium_span in pdfium_line['spans']:
                        span['content'] += pdfium_span['text']
                    break
        for span in vertical_spans:
            if len(span['content']) == 0:
                spans.remove(span)
    # Horizontal spans are filled with chars first; still-empty ones go to OCR.
    """水平的span框先用char填充,再用ocr填充空的span框"""
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.TEXT]:
            span['chars'] = []
            new_spans.append(span)
    need_ocr_spans = fill_char_in_spans(new_spans, page_all_chars)
    # OCR the spans that could not be filled from the text layer.
    """对未填充的span进行ocr"""
    if len(need_ocr_spans) > 0:
        for span in need_ocr_spans:
            # Crop the span's bbox from the page image, then hand it to OCR.
            span_pil_img = get_crop_img(span['bbox'], pil_img, scale)
            span_img = cv2.cvtColor(np.array(span_pil_img), cv2.COLOR_RGB2BGR)
            # Spans with contrast <= 0.17 are dropped instead of OCRed.
            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                spans.remove(span)
                continue
            span['content'] = ''
            span['score'] = 1.0
            span['np_img'] = span_img
    return spans
def fill_char_in_spans(spans, all_chars):
    """Distribute PDF chars into the spans they geometrically belong to.

    Returns the spans that stayed (effectively) empty and thus need OCR.
    """
    # Roughly sort spans top-to-bottom so a char binds to the upper match first.
    spans = sorted(spans, key=lambda x: x['bbox'][1])
    for char in all_chars:
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char['char']):
                span['chars'].append(char)
                break
    need_ocr_spans = []
    for span in spans:
        chars_to_content(span)
        # Some spans hold only one or two empty placeholders; use the ratio of
        # content length vs. span width/height to treat them as empty.
        if len(span['content']) * span['height'] < span['width'] * 0.5:
            # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
            need_ocr_spans.append(span)
        del span['height'], span['width']
    return need_ocr_spans
# Characters that typically close a line; they get a relaxed right-edge match.
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
# Characters that typically open a line; they get a relaxed left-edge match.
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
# Max allowed offset between char mid-line and span mid-line, as a fraction of span height.
Span_Height_Radio = 0.33


def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=Span_Height_Radio):
    """Decide whether a character box belongs to a span box.

    A char matches when its center lies inside the span and its vertical
    center is within ``span_height_radio`` of the span's vertical center.
    Line-stop / line-start punctuation gets a relaxed edge test so trailing
    and leading punctuation near the span border can still join it.

    Returns:
        bool: True when the char should be attributed to the span.
    """
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]
    if (
        span_bbox[0] < char_center_x < span_bbox[2]
        and span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height * span_height_radio
    ):
        return True
    if char in LINE_STOP_FLAG:
        # Trailing punctuation: accept when its left edge sits within one
        # span-height of the span's right border and the vertical test holds.
        if (
            (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
            and char_center_x > span_bbox[0]
            and span_bbox[1] < char_center_y < span_bbox[3]
            and abs(char_center_y - span_center_y) < span_height * span_height_radio
        ):
            return True
    elif char in LINE_START_FLAG:
        # Leading punctuation: the mirrored test on the span's left border.
        if (
            span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
            and char_center_x < span_bbox[2]
            and span_bbox[1] < char_center_y < span_bbox[3]
            and abs(char_center_y - span_center_y) < span_height * span_height_radio
        ):
            return True
    # Fix: the original fell through and implicitly returned None on several
    # paths; always return an explicit bool.
    return False
def chars_to_content(span):
    """Assemble span['chars'] into span['content'], inserting inferred spaces.

    Leaves the span untouched when it has no chars (downstream treats that as
    "needs OCR"); otherwise replaces the 'chars' list with a cleaned
    'content' string.
    """
    if len(span['chars']) == 0:
        # No chars collected: leave the span as-is for the OCR fallback.
        pass
    else:
        # Restore text-stream order.
        span['chars'] = sorted(span['chars'], key=lambda x: x['char_idx'])
        # The median char width drives the missing-space heuristic below.
        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
        median_width = statistics.median(char_widths)
        content = ''
        chars = span['chars']
        for idx, char in enumerate(chars):
            # Fix: the original found the neighbor via list.index(), which is
            # O(n^2) and picks the wrong neighbor when equal char dicts repeat.
            nxt = chars[idx + 1] if idx + 1 < len(chars) else None
            # A gap wider than a quarter char width suggests a missing space.
            if nxt and nxt['bbox'][0] - char['bbox'][2] > median_width * 0.25 and char['char'] != ' ' and nxt['char'] != ' ':
                content += f"{char['char']} "
            else:
                content += char['char']
        content = __replace_unicode(content)
        # Fix: the original applied __replace_ligatures twice; once suffices.
        content = __replace_ligatures(content)
        span['content'] = content.strip()
        del span['chars']
def calculate_contrast(img, img_mode) -> float:
    """Compute an image's contrast as std/mean of its grayscale pixel values.

    :param img: image as a numpy.ndarray
    :param img_mode: color channel order of *img*, 'rgb' or 'bgr'
    :return: contrast value, rounded to two decimals
    """
    conversions = {
        'rgb': cv2.COLOR_RGB2GRAY,
        'bgr': cv2.COLOR_BGR2GRAY,
    }
    if img_mode not in conversions:
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
    gray_img = cv2.cvtColor(img, conversions[img_mode])
    mean_value = np.mean(gray_img)
    std_dev = np.std(gray_img)
    # The small epsilon guards against division by zero on all-black images.
    contrast = std_dev / (mean_value + 1e-6)
    # logger.debug(f"contrast: {contrast}")
    return round(contrast, 2)
\ No newline at end of file
__version__ = "2.0.0"
\ No newline at end of file
......@@ -4,30 +4,22 @@ import base64
import os
import re
import time
import uuid
import zipfile
from pathlib import Path
import gradio as gr
import pymupdf
from gradio_pdf import PDF
from loguru import logger
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.hash_utils import compute_sha256
from magic_pdf.tools.common import do_parse, prepare_env
from mineru.cli.common import prepare_env, do_parse, read_fn
from mineru.utils.hash_utils import str_sha256
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
os.makedirs(output_dir, exist_ok=True)
try:
file_name = f'{str(Path(doc_path).stem)}_{time.time()}'
file_name = f'{str(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
pdf_data = read_fn(doc_path)
if is_ocr:
parse_method = 'ocr'
......@@ -35,17 +27,14 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_en
parse_method = 'auto'
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
do_parse(
output_dir,
file_name,
pdf_data,
[],
parse_method,
False,
output_dir=output_dir,
pdf_file_names=[file_name],
pdf_bytes_list=[pdf_data],
p_lang_list=[language],
parse_method=parse_method,
end_page_id=end_page_id,
layout_model=layout_mode,
formula_enable=formula_enable,
table_enable=table_enable,
lang=language,
p_formula_enable=formula_enable,
p_table_enable=table_enable,
)
return local_md_dir, file_name
except Exception as e:
......@@ -96,12 +85,11 @@ def replace_image_with_base64(markdown_text, image_dir_path):
return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language):
file_path = to_pdf(file_path)
# 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
layout_mode, formula_enable, table_enable, language)
archive_zip_path = os.path.join('./output', compute_sha256(local_md_dir) + '.zip')
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language)
archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0:
logger.info('压缩成功')
......@@ -125,24 +113,6 @@ latex_delimiters = [
]
def init_model():
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
try:
model_manager = ModelSingleton()
txt_model = model_manager.get_model(False, False) # noqa: F841
logger.info('txt_model init final')
ocr_model = model_manager.get_model(True, False) # noqa: F841
logger.info('ocr_model init final')
return 0
except Exception as e:
logger.exception(e)
return -1
model_init = init_model()
logger.info(f'model_init: {model_init}')
with open('header.html', 'r') as file:
header = file.read()
......@@ -171,24 +141,30 @@ all_lang = []
all_lang.extend([*other_lang, *add_lang])
def safe_stem(file_path):
    """Sanitize a filename stem: keep word chars and dots, replace the rest with '_'."""
    return re.sub(r'[^\w.]', '_', Path(file_path).stem)
def to_pdf(file_path):
    """Normalize an uploaded file to a PDF on disk and return its path.

    The span as captured interleaved the old pymupdf-based implementation
    with the new read_fn-based one; this is the coherent post-merge version.

    Args:
        file_path: Path of the uploaded file, or None when nothing was uploaded.

    Returns:
        Path of the written PDF next to the input file, or None when
        file_path is None.
    """
    if file_path is None:
        return None
    # read_fn is defined elsewhere in this project; presumably it returns the
    # upload's content as PDF bytes (converting images/office docs) — confirm.
    pdf_bytes = read_fn(file_path)
    # Use a sanitized stem instead of a random uuid so the output name stays
    # recognizably related to the upload.
    # unique_filename = f'{uuid.uuid4()}.pdf'
    unique_filename = f'{safe_stem(file_path)}.pdf'
    # Write the converted bytes alongside the source file.
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
    with open(tmp_file_path, 'wb') as tmp_pdf_file:
        tmp_pdf_file.write(pdf_bytes)
    return tmp_file_path
if __name__ == '__main__':
......@@ -196,14 +172,16 @@ if __name__ == '__main__':
gr.HTML(header)
with gr.Row():
with gr.Column(variant='panel', scale=5):
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row():
layout_mode = gr.Dropdown(['doclayout_yolo'], label='Layout model', value='doclayout_yolo')
language = gr.Dropdown(all_lang, label='Language', value='ch')
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
with gr.Row(equal_height=True):
with gr.Column(scale=4):
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Column(scale=1):
language = gr.Dropdown(all_lang, label='Language', value='ch')
with gr.Row():
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row():
change_bu = gr.Button('Convert')
......@@ -227,7 +205,7 @@ if __name__ == '__main__':
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
......
# ---- Build backend -------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
# ---- Core project metadata (PEP 621) -------------------------------------
[project]
name = "mineru"
# Version is resolved at build time from mineru.version.__version__
# (see [tool.setuptools.dynamic] below).
dynamic = ["version"]
license = {text = "AGPL-3.0"}
description = "A practical tool for converting PDF to Markdown"
readme = "README.md"
requires-python = ">=3.10,<3.14"
keywords = ["magic-pdf", "mineru", "MinerU", "convert", "pdf", "markdown"]
classifiers = [
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
# Minimal runtime dependencies shared by every install flavor.
dependencies = [
    "boto3>=1.28.43",
    "click>=8.1.7",
    "loguru>=0.7.2",
    "numpy>=1.21.6",
    "pdfminer.six==20250506",
    "tqdm>=4.67.1",
    "requests",
    "httpx",
    "pillow>=11.0.0",
    "pypdfium2>=4.30.0",
    "pypdf>=5.6.0",
    "reportlab",
    "pdftext>=0.6.2",
    "modelscope>=1.26.0",
    "huggingface-hub>=0.32.4",
    "json-repair>=0.46.2",
]
# ---- Optional install flavors --------------------------------------------
[project.optional-dependencies]
# VLM-based parsing backend.
vlm = [
    "transformers>=4.51.1",
    "torch>=2.6.0",
    "accelerate>=1.5.1",
    "pydantic",
]
# sglang inference server support.
sglang = [
    "sglang[all]>=0.4.7",
]
# Classic layout/OCR pipeline backend.
pipeline = [
    "matplotlib>=3.10,<4",
    "ultralytics>=8.3.48,<9",
    "doclayout_yolo==0.0.4",
    "dill>=0.3.8,<1",
    "rapid_table>=1.0.5,<2.0.0",
    "PyYAML>=6.0.2,<7",
    "ftfy>=6.3.1,<7",
    "openai>=1.70.0,<2",
    "shapely>=2.0.7,<3",
    "pyclipper>=1.3.0,<2",
    "omegaconf>=2.3.0,<3",
    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
    "torchvision",
    "transformers>=4.49.0,!=4.51.0,<5.0.0",
    "fast-langdetect>=0.2.3,<0.3.0",
]
# Convenience meta-extras composed from the flavors above.
core = [
    "mineru[vlm]",
    "mineru[pipeline]",
]
all = [
    "mineru[core]",
    "mineru[sglang]",
]
# Tighter pins for older Linux distributions where newer wheels are unavailable.
pipeline_old_linux = [
    "matplotlib>=3.10,<=3.10.1",
    "ultralytics>=8.3.48,<=8.3.104",
    "doclayout_yolo==0.0.4",
    "dill==0.3.8",
    "PyYAML==6.0.2",
    "ftfy==6.3.1",
    "openai==1.71.0",
    "shapely==2.1.0",
    "pyclipper==1.3.0.post6",
    "omegaconf==2.3.0",
    "albumentations==1.4.20",
    "rapid_table==1.0.3",
    "torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
    "torchvision",
    "transformers>=4.49.0,!=4.51.0,<5.0.0",
    "fast-langdetect>=0.2.3,<0.3.0",
]
[project.urls]
Home = "https://mineru.net/"
Repository = "https://github.com/opendatalab/MinerU"
# ---- Console entry points -------------------------------------------------
[project.scripts]
mineru = "mineru.cli:client.main"
mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
mineru-models-download = "mineru.cli.models_download:download_models"
# ---- setuptools configuration ---------------------------------------------
[tool.setuptools.dynamic]
version = {attr = "mineru.version.__version__"}
[tool.setuptools.packages.find]
include = ["mineru*"]
namespaces = false
# Bundle non-Python resource files into the wheel.
[tool.setuptools.package-data]
"mineru" = ["resources/**"]
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils" = ["resources/**"]
[tool.setuptools]
include-package-data = true
zip-safe = false
# Core runtime requirements; keep in sync with pyproject.toml where overlapping.
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect>=0.2.3,<0.3.0
loguru>=0.6.0
numpy>=1.21.6
pydantic>=2.7.2,<2.11
PyMuPDF>=1.24.9,<1.25.0
scikit-learn>=1.0.2
# torch 2.5.0/2.5.1 are excluded (known-bad releases for this project).
torch>=2.2.2,!=2.5.0,!=2.5.1,<3
torchvision
# transformers 4.51.0 is excluded (known-bad release for this project).
transformers>=4.49.0,!=4.51.0,<5.0.0
pdfminer.six==20250506
tqdm>=4.67.1
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment