wangsen / MinerU / Commits / 0c7a0882

Unverified commit 0c7a0882, authored Jun 12, 2025 by Xiaomeng Zhao; committed by GitHub on Jun 12, 2025.

Merge pull request #2611 from myhloli/dev

Dev

Parents: 3bd0ecf1, a392f445
Changes: 262

Showing 20 changed files with 29 additions and 2353 deletions (+29, -2353)
magic_pdf/operators/pipes.py                              +0   -191
magic_pdf/pdf_parse_union_core_v2.py                      +0   -1049
magic_pdf/pre_proc/__init__.py                            +0   -0
magic_pdf/pre_proc/construct_page_dict.py                 +0   -17
magic_pdf/pre_proc/cut_image.py                           +0   -32
magic_pdf/pre_proc/ocr_span_list_modify.py                +0   -131
magic_pdf/pre_proc/remove_bbox_overlap.py                 +0   -100
magic_pdf/resources/model_config/model_configs.yaml       +0   -8
magic_pdf/resources/slanet_plus/slanet-plus.onnx          +0   -0
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt     +0   -0
magic_pdf/spark/__init__.py                               +0   -0
magic_pdf/spark/spark_api.py                              +0   -49
magic_pdf/tools/__init__.py                               +0   -0
magic_pdf/tools/cli.py                                    +0   -161
magic_pdf/tools/cli_dev.py                                +0   -149
magic_pdf/tools/common.py                                 +0   -340
magic_pdf/utils/__init__.py                               +0   -0
magic_pdf/utils/annotations.py                            +0   -11
magic_pdf/utils/office_to_pdf.py                          +0   -115
mineru.template.json                                      +29  -0
magic_pdf/operators/pipes.py (deleted, 100644 → 0)

```python
import copy
import json
import os
from typing import Callable

from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_span_bbox)
from magic_pdf.libs.json_compressor import JsonCompressor


class PipeResult:
    def __init__(self, pipe_res, dataset: Dataset):
        """Initialize.

        Args:
            pipe_res (list[dict]): the pipeline-processed model inference result
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    def get_markdown(
        self,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> str:
        """Get markdown content.

        Args:
            img_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
            md_make_mode (str, optional): the type of Markdown content to make. Defaults to MakeMode.MM_MD.

        Returns:
            str: markdown content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
        return md_content

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Dump the Markdown.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the markdown
            img_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
            md_make_mode (str, optional): the type of Markdown content to make. Defaults to MakeMode.MM_MD.
        """
        md_content = self.get_markdown(
            img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
        )
        writer.write_string(file_path, md_content)

    def get_content_list(
        self,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ) -> str:
        """Get the content list.

        Args:
            image_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.

        Returns:
            str: content list content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )
        return content_list

    def dump_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ):
        """Dump the content list.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the content list
            image_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
        """
        content_list = self.get_content_list(
            image_dir_or_bucket_prefix, drop_mode=drop_mode,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def get_middle_json(self) -> str:
        """Get middle json.

        Returns:
            str: the content of the middle json
        """
        return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Dump the result of the pipeline.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the middle json
        """
        middle_json = self.get_middle_json()
        writer.write_string(file_path, middle_json)

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout.

        Args:
            file_path (str): the file location of the layout result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str):
        """Draw the spans.

        Args:
            file_path (str): the file location of the span result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str):
        """Draw the line sort order.

        Args:
            file_path (str): the file location of the line sort result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            str: the compressed pipeline result
        """
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the pipeline result.

        Args:
            proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
```
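
For orientation, here is a minimal sketch of how this now-deleted PipeResult API was typically driven. The output directory and file names are illustrative assumptions, and `pipe_result` stands for an object produced upstream by the pipeline; only `FileBasedDataWriter` is taken from the package itself.

```python
# Hypothetical driver for the deleted PipeResult API (names are illustrative).
from magic_pdf.data.data_reader_writer import FileBasedDataWriter

def dump_all(pipe_result, out_dir='output'):
    writer = FileBasedDataWriter(out_dir)
    # Markdown, with figure links resolved against a local "images" directory
    pipe_result.dump_md(writer, 'example.md', 'images')
    # Structured content list plus the raw middle json, useful for debugging
    pipe_result.dump_content_list(writer, 'example_content_list.json', 'images')
    pipe_result.dump_middle_json(writer, 'example_middle.json')
```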
magic_pdf/pdf_parse_union_core_v2.py (deleted, 100644 → 0)

```python
import copy
import math
import os
import re
import statistics
import time
import warnings
from typing import List

import cv2
import fitz
import torch
import numpy as np
from loguru import logger
from tqdm import tqdm

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, __is_overlaps_y_exceeds_threshold
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.post_proc.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
    remove_overlaps_min_spans, remove_x_overlapping_chars

os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # prevent albumentations from checking for updates


def __replace_STX_ETX(text_str: str):
    """Replace \u0002 and \u0003, as these characters become garbled when
    extracted using pymupdf. In fact, they were originally quotation marks.

    Drawback: This issue has only been observed in English text so far, not in Chinese text.

    Args:
        text_str (str): raw text

    Returns:
        _type_: replaced text
    """  # noqa: E501
    if text_str:
        s = text_str.replace('\u0002', "'")
        s = s.replace('\u0003', "'")
        return s
    return text_str


# Split ligature characters
def __replace_ligatures(text: str):
    ligatures = {
        'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬅ': 'ft', 'ﬆ': 'st'
    }
    return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)


def chars_to_content(span):
    # Check whether the chars in the span are empty
    if len(span['chars']) == 0:
        pass
    else:
        # First sort the chars by the x coordinate of the center of char['bbox']
        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)

        # Calculate the width of each character
        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
        # Calculate the median width
        median_width = statistics.median(char_widths)

        # Remove some chars based on their x-axis overlap ratio
        span = remove_x_overlapping_chars(span, median_width)

        content = ''
        for char in span['chars']:
            # If the distance between the next char's x0 and the previous char's x1 exceeds
            # 0.25 of a character width, a space needs to be inserted between them
            char1 = char
            char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
            if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
                content += f"{char['c']} "
            else:
                content += char['c']
        span['content'] = __replace_ligatures(content)

    del span['chars']


LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)


def fill_char_in_spans(spans, all_chars):
    # Simply sort the spans from top to bottom
    spans = sorted(spans, key=lambda x: x['bbox'][1])

    for char in all_chars:
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
                span['chars'].append(char)
                break

    need_ocr_spans = []
    for span in spans:
        chars_to_content(span)
        # Some spans contain no text but hold one or two empty placeholders;
        # filter them by width/height and content length
        if len(span['content']) * span['height'] < span['width'] * 0.5:
            # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
            need_ocr_spans.append(span)
        del span['height'], span['width']
    return need_ocr_spans


# Use the more robust center-point test
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]

    if (
        span_bbox[0] < char_center_x < span_bbox[2]
        and span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height * span_height_radio  # the vertical offset between the char's center line and the span's center line must stay within span_height_radio of the span height
    ):
        return True
    else:
        # If the char is a LINE_STOP_FLAG, skip the center-point test and use another scheme
        # (left boundary inside the span area, height test unchanged).
        # This mainly gives trailing punctuation a chance to enter the span; such a char
        # should also be close to the span's right boundary.
        if char in LINE_STOP_FLAG:
            if (
                (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
                and char_center_x > span_bbox[0]
                and span_bbox[1] < char_center_y < span_bbox[3]
                and abs(char_center_y - span_center_y) < span_height * span_height_radio
            ):
                return True
        elif char in LINE_START_FLAG:
            if (
                span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
                and char_center_x < span_bbox[2]
                and span_bbox[1] < char_center_y < span_bbox[3]
                and abs(char_center_y - span_center_y) < span_height * span_height_radio
            ):
                return True
        else:
            return False


def remove_tilted_line(text_blocks):
    for block in text_blocks:
        remove_lines = []
        for line in block['lines']:
            cosine, sine = line['dir']
            # Compute the angle in radians
            angle_radians = math.atan2(sine, cosine)
            # Convert radians to degrees
            angle_degrees = math.degrees(angle_radians)
            if 2 < abs(angle_degrees) < 88:
                remove_lines.append(line)
        for line in remove_lines:
            block['lines'].remove(line)


def calculate_contrast(img, img_mode) -> float:
    """Compute the contrast of the given image.

    :param img: the image, as a numpy.ndarray
    :param img_mode: the color channel order of the image, 'rgb' or 'bgr'
    :return: the contrast value of the image
    """
    if img_mode == 'rgb':
        # Convert an RGB image to grayscale
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    elif img_mode == 'bgr':
        # Convert a BGR image to grayscale
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")

    # Compute the mean and standard deviation
    mean_value = np.mean(gray_img)
    std_dev = np.std(gray_img)
    # Contrast is defined as the standard deviation divided by the mean
    # (plus a small constant to avoid division by zero)
    contrast = std_dev / (mean_value + 1e-6)
    # logger.debug(f"contrast: {contrast}")
    return round(contrast, 2)


# @measure_time
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
    # cid rendered as 0xfffd, ligatures split apart
    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']

    # cid rendered as 0xfffd, ligatures kept intact
    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']

    # Custom flags produce many 0xfffd characters; pymupdf may be able to handle PDFs with
    # built-in dictionaries on its own, so custom flags are no longer used
    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
    # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']

    # Remove all lines whose angle is neither 0 nor 90 degrees
    remove_tilted_line(text_blocks_raw)

    all_pymu_chars = []
    for block in text_blocks_raw:
        for line in block['lines']:
            cosine, sine = line['dir']
            if abs(cosine) < 0.9 or abs(sine) > 0.1:
                continue
            for span in line['spans']:
                all_pymu_chars.extend(span['chars'])

    # Compute the median height of all spans
    span_height_list = []
    for span in spans:
        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        span_height = span['bbox'][3] - span['bbox'][1]
        span['height'] = span_height
        span['width'] = span['bbox'][2] - span['bbox'][0]
        span_height_list.append(span_height)
    if len(span_height_list) == 0:
        return spans
    else:
        median_span_height = statistics.median(span_height_list)

    useful_spans = []
    unuseful_spans = []
    # Two features of a vertical span: 1. its height covers several lines 2. its height/width ratio exceeds some value
    vertical_spans = []
    for span in spans:
        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for block in all_bboxes + all_discarded_blocks:
            if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
                continue
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
                if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
                    vertical_spans.append(span)
                elif block in all_bboxes:
                    useful_spans.append(span)
                else:
                    unuseful_spans.append(span)
                break

    """Vertical span boxes are filled directly from pymu lines"""
    if len(vertical_spans) > 0:
        text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
        all_pymu_lines = []
        for block in text_blocks:
            for line in block['lines']:
                all_pymu_lines.append(line)

        for pymu_line in all_pymu_lines:
            for span in vertical_spans:
                if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
                    for pymu_span in pymu_line['spans']:
                        span['content'] += pymu_span['text']
                    break

        for span in vertical_spans:
            if len(span['content']) == 0:
                spans.remove(span)

    """Horizontal span boxes with no chars are filled by OCR"""
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.Text]:
            span['chars'] = []
            new_spans.append(span)

    need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)

    if len(need_ocr_spans) > 0:
        # Initialize the OCR model
        # atom_model_manager = AtomModelSingleton()
        # ocr_model = atom_model_manager.get_atom_model(
        #     atom_model_name='ocr',
        #     ocr_show_log=False,
        #     det_db_box_thresh=0.3,
        #     lang=lang
        # )

        for span in need_ocr_spans:
            # Crop the span's bbox, then OCR it
            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')

            # Compute the span's contrast; spans at or below 0.17 are not OCRed
            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                spans.remove(span)
                continue
            # pass

            span['content'] = ''
            span['score'] = 1
            span['np_img'] = span_img

            # ocr_res = ocr_model.ocr(span_img, det=False)
            # if ocr_res and len(ocr_res) > 0:
            #     if len(ocr_res[0]) > 0:
            #         ocr_text, ocr_score = ocr_res[0][0]
            #         # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
            #         if ocr_score > 0.5 and len(ocr_text) > 0:
            #             span['content'] = ocr_text
            #             span['score'] = float(round(ocr_score, 2))
            #         else:
            #             spans.remove(span)

    return spans


def model_init(model_name: str):
    from transformers import LayoutLMv3ForTokenClassification
    device_name = get_device()
    bf_16_support = False
    if device_name.startswith("cuda"):
        bf_16_support = torch.cuda.is_bf16_supported()
    elif device_name.startswith("mps"):
        bf_16_support = True

    device = torch.device(device_name)
    if model_name == 'layoutreader':
        # Check whether the modelscope cache directory exists
        layoutreader_model_dir = get_local_layoutreader_model_dir()
        if os.path.exists(layoutreader_model_dir):
            model = LayoutLMv3ForTokenClassification.from_pretrained(layoutreader_model_dir)
        else:
            logger.warning('local layoutreader model not exists, use online model from huggingface')
            model = LayoutLMv3ForTokenClassification.from_pretrained('hantian/layoutreader')
        if bf_16_support:
            model.to(device).eval().bfloat16()
        else:
            model.to(device).eval()
    else:
        logger.error('model name not allow')
        exit(1)
    return model


class ModelSingleton:
    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, model_name: str):
        if model_name not in self._models:
            self._models[model_name] = model_init(model_name=model_name)
        return self._models[model_name]


def do_predict(boxes: List[List[int]], model) -> List[int]:
    from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
        boxes2inputs, parse_logits, prepare_inputs)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
        inputs = boxes2inputs(boxes)
        inputs = prepare_inputs(inputs, model)
        logits = model(**inputs).logits.cpu().squeeze(0)
    return parse_logits(logits, len(boxes))


def cal_block_index(fix_blocks, sorted_bboxes):
    if sorted_bboxes is not None:
        # Sort with layoutreader
        for block in fix_blocks:
            line_index_list = []
            if len(block['lines']) == 0:
                block['index'] = sorted_bboxes.index(block['bbox'])
            else:
                for line in block['lines']:
                    line['index'] = sorted_bboxes.index(line['bbox'])
                    line_index_list.append(line['index'])
                median_value = statistics.median(line_index_list)
                block['index'] = median_value

            # Delete the virtual line info in image/table body blocks and backfill it with real_lines
            if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
                if 'real_lines' in block:
                    block['virtual_lines'] = copy.deepcopy(block['lines'])
                    block['lines'] = copy.deepcopy(block['real_lines'])
                    del block['real_lines']
    else:
        # Sort with xycut
        block_bboxes = []
        for block in fix_blocks:
            # If any value of block['bbox'] is below 0, set it to 0
            block['bbox'] = [max(0, x) for x in block['bbox']]
            block_bboxes.append(block['bbox'])

            # Delete the virtual line info in image/table body blocks and backfill it with real_lines
            if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
                if 'real_lines' in block:
                    block['virtual_lines'] = copy.deepcopy(block['lines'])
                    block['lines'] = copy.deepcopy(block['real_lines'])
                    del block['real_lines']

        import numpy as np
        from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
            recursive_xy_cut

        random_boxes = np.array(block_bboxes)
        np.random.shuffle(random_boxes)
        res = []
        recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
        assert len(res) == len(block_bboxes)
        sorted_boxes = random_boxes[np.array(res)].tolist()

        for i, block in enumerate(fix_blocks):
            block['index'] = sorted_boxes.index(block['bbox'])

        # Generate line indexes
        sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
        line_inedx = 1
        for block in sorted_blocks:
            for line in block['lines']:
                line['index'] = line_inedx
                line_inedx += 1

    return fix_blocks


def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    # block_bbox is a tuple (x0, y0, x1, y1); (x0, y0) is one corner and (x1, y1) the opposite corner
    x0, y0, x1, y1 = block_bbox

    block_height = y1 - y0
    block_weight = x1 - x0

    # If the block height is below n lines of body text, return the block's bbox directly
    if line_height * 2 < block_height:
        if (
            block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
        ):  # possibly a two-column layout, can be cut finer
            lines = int(block_height / line_height)
        else:
            # If the block width exceeds 0.4 of the page width, split the block into 3 lines
            # (a complex layout; the image must not be cut too finely)
            if block_weight > page_w * 0.4:
                lines = 3
            elif block_weight > page_w * 0.25:  # (possibly a three-column layout, also cut finer)
                lines = int(block_height / line_height)
            else:  # check the aspect ratio
                if block_height / block_weight > 1.2:  # tall and narrow blocks are not split
                    return [[x0, y0, x1, y1]]
                else:  # otherwise split into two lines
                    lines = 2

        line_height = (y1 - y0) / lines

        # Determine the y position to start drawing lines from
        current_y = y0

        # Store the line positions [(x0, y), ...]
        lines_positions = []

        for i in range(lines):
            lines_positions.append([x0, current_y, x1, current_y + line_height])
            current_y += line_height
        return lines_positions

    else:
        return [[x0, y0, x1, y1]]


def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
    page_line_list = []

    def add_lines_to_block(b):
        line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h)
        b['lines'] = []
        for line_bbox in line_bboxes:
            b['lines'].append({'bbox': line_bbox, 'spans': []})
        page_line_list.extend(line_bboxes)

    for block in fix_blocks:
        if block['type'] in [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote
        ]:
            if len(block['lines']) == 0:
                add_lines_to_block(block)
            elif block['type'] in [BlockType.Title] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2:
                block['real_lines'] = copy.deepcopy(block['lines'])
                add_lines_to_block(block)
            else:
                for line in block['lines']:
                    bbox = line['bbox']
                    page_line_list.append(bbox)
        elif block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
            block['real_lines'] = copy.deepcopy(block['lines'])
            add_lines_to_block(block)

    for block in footnote_blocks:
        footnote_block = {'bbox': block[:4]}
        add_lines_to_block(footnote_block)

    if len(page_line_list) > 200:  # layoutreader supports at most 512 lines
        return None

    # Sort with layoutreader
    x_scale = 1000.0 / page_w
    y_scale = 1000.0 / page_h
    boxes = []
    # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
    for left, top, right, bottom in page_line_list:
        if left < 0:
            logger.warning(
                f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            left = 0
        if right > page_w:
            logger.warning(
                f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            right = page_w
        if top < 0:
            logger.warning(
                f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            top = 0
        if bottom > page_h:
            logger.warning(
                f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            bottom = page_h

        left = round(left * x_scale)
        top = round(top * y_scale)
        right = round(right * x_scale)
        bottom = round(bottom * y_scale)
        assert (
            1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
        ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'  # noqa: E126, E121
        boxes.append([left, top, right, bottom])

    model_manager = ModelSingleton()
    model = model_manager.get_model('layoutreader')
    with torch.no_grad():
        orders = do_predict(boxes, model)
    sorted_bboxes = [page_line_list[i] for i in orders]

    return sorted_bboxes


def get_line_height(blocks):
    page_line_height_list = []
    for block in blocks:
        if block['type'] in [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote
        ]:
            for line in block['lines']:
                bbox = line['bbox']
                page_line_height_list.append(int(bbox[3] - bbox[1]))
    if len(page_line_height_list) > 0:
        return statistics.median(page_line_height_list)
    else:
        return 10


def process_groups(groups, body_key, caption_key, footnote_key):
    body_blocks = []
    caption_blocks = []
    footnote_blocks = []
    for i, group in enumerate(groups):
        group[body_key]['group_id'] = i
        body_blocks.append(group[body_key])
        for caption_block in group[caption_key]:
            caption_block['group_id'] = i
            caption_blocks.append(caption_block)
        for footnote_block in group[footnote_key]:
            footnote_block['group_id'] = i
            footnote_blocks.append(footnote_block)
    return body_blocks, caption_blocks, footnote_blocks


def process_block_list(blocks, body_type, block_type):
    indices = [block['index'] for block in blocks]
    median_index = statistics.median(indices)

    body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])

    return {
        'type': block_type,
        'bbox': body_bbox,
        'blocks': blocks,
        'index': median_index,
    }


def revert_group_blocks(blocks):
    image_groups = {}
    table_groups = {}
    new_blocks = []
    for block in blocks:
        if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
            group_id = block['group_id']
            if group_id not in image_groups:
                image_groups[group_id] = []
            image_groups[group_id].append(block)
        elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
            group_id = block['group_id']
            if group_id not in table_groups:
                table_groups[group_id] = []
            table_groups[group_id].append(block)
        else:
            new_blocks.append(block)

    for group_id, blocks in image_groups.items():
        new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))

    for group_id, blocks in table_groups.items():
        new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))

    return new_blocks


def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    def get_block_bboxes(blocks, block_type_list):
        return [block[0:4] for block in blocks if block[7] in block_type_list]

    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
    other_block_type = []
    for block_type in BlockType.__dict__.values():
        if not isinstance(block_type, str):
            continue
        if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
            other_block_type.append(block_type)
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])

    new_spans = []

    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
               discarded_block_bboxes):
            new_spans.append(span)
            continue

        if span_type == ContentType.Image:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   image_bboxes):
                new_spans.append(span)
        elif span_type == ContentType.Table:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   table_bboxes):
                new_spans.append(span)
        else:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   other_block_bboxes):
                new_spans.append(span)

    return new_spans


def parse_page_core(
    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
):
    need_drop = False
    drop_reason = []

    """Get the block info that will be used later from the magic_model object"""
    img_groups = magic_model.get_imgs_v2(page_id)
    table_groups = magic_model.get_tables_v2(page_id)

    """Group the image and table blocks"""
    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
    )
    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
    )

    discarded_blocks = magic_model.get_discarded(page_id)
    text_blocks = magic_model.get_text_blocks(page_id)
    title_blocks = magic_model.get_title_blocks(page_id)
    inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)

    page_w, page_h = magic_model.get_page_size(page_id)

    def merge_title_blocks(blocks, x_distance_threshold=0.1 * page_w):
        def merge_two_bbox(b1, b2):
            x_min = min(b1['bbox'][0], b2['bbox'][0])
            y_min = min(b1['bbox'][1], b2['bbox'][1])
            x_max = max(b1['bbox'][2], b2['bbox'][2])
            y_max = max(b1['bbox'][3], b2['bbox'][3])
            return x_min, y_min, x_max, y_max

        def merge_two_blocks(b1, b2):
            # Merge the bounding boxes of the two title blocks
            b1['bbox'] = merge_two_bbox(b1, b2)
            # Merge the text content of the two title blocks
            line1 = b1['lines'][0]
            line2 = b2['lines'][0]
            line1['bbox'] = merge_two_bbox(line1, line2)
            line1['spans'].extend(line2['spans'])
            return b1, b2

        # Cluster the title blocks by y-axis overlap
        y_overlapping_blocks = []
        title_bs = [b for b in blocks if b['type'] == BlockType.Title]
        while title_bs:
            block1 = title_bs.pop(0)
            current_row = [block1]
            to_remove = []
            for block2 in title_bs:
                if (
                    __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9)
                    and len(block1['lines']) == 1
                    and len(block2['lines']) == 1
                ):
                    current_row.append(block2)
                    to_remove.append(block2)
            for b in to_remove:
                title_bs.remove(b)
            y_overlapping_blocks.append(current_row)

        # Sort by x coordinate and merge the title blocks
        to_remove_blocks = []
        for row in y_overlapping_blocks:
            if len(row) == 1:
                continue

            # Sort by x coordinate
            row.sort(key=lambda x: x['bbox'][0])

            merged_block = row[0]
            for i in range(1, len(row)):
                left_block = merged_block
                right_block = row[i]

                left_height = left_block['bbox'][3] - left_block['bbox'][1]
                right_height = right_block['bbox'][3] - right_block['bbox'][1]

                if (
                    right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold
                    and left_height * 0.95 < right_height < left_height * 1.05
                ):
                    merged_block, to_remove_block = merge_two_blocks(merged_block, right_block)
                    to_remove_blocks.append(to_remove_block)
                else:
                    merged_block = right_block

        for b in to_remove_blocks:
            blocks.remove(b)

    """Gather the bboxes of all blocks"""
    # The interline_equation_blocks parameter is not accurate enough; switch to interline_equations below
    interline_equation_blocks = []
    if len(interline_equation_blocks) > 0:
        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equation_blocks,
            page_w,
            page_h,
        )
    else:
        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equations,
            page_w,
            page_h,
        )

    """Get all span info"""
    spans = magic_model.get_all_spans(page_id)

    """Before removing duplicate spans, filter image and table spans through the image_body and table_body blocks"""
    """Also remove large watermarks and keep abandoned spans"""
    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)

    """Remove the lower-confidence ones among overlapping spans"""
    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
    """Remove the smaller ones among overlapping spans"""
    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

    """Construct the spans according to parse_mode; mainly character filling for text spans"""
    if parse_mode == SupportedPdfParseMethod.TXT:
        """Use the new hybrid OCR scheme."""
        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
    elif parse_mode == SupportedPdfParseMethod.OCR:
        pass
    else:
        raise Exception('parse_mode must be txt or ocr')

    """First handle the discarded_blocks, which need no layout analysis"""
    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

    """Skip the current page if it has no valid bbox"""
    if len(all_bboxes) == 0:
        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
        return ocr_construct_page_component_v2(
            [], [], page_id, page_w, page_h, [], [], [], interline_equations, fix_discarded_blocks,
            need_drop, drop_reason,
        )

    """Crop images and tables"""
    spans = ocr_cut_image_and_table(spans, page_doc, page_id, pdf_bytes_md5, imageWriter)

    """Fill the spans into the blocks"""
    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)

    """Fix the blocks"""
    fix_blocks = fix_block_spans_v2(block_with_spans)

    """Merge titles that were broken apart on the same line"""
    merge_title_blocks(fix_blocks)

    """Get all lines and compute the body line height"""
    line_height = get_line_height(fix_blocks)

    """Get all lines and sort them"""
    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)

    """Compute the block ordering from the median of the line indexes"""
    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

    """Revert the image and table blocks back into group form for the rest of the pipeline"""
    fix_blocks = revert_group_blocks(fix_blocks)

    """Re-sort the blocks"""
    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

    """Re-sort within blocks (ordering of multiple captions or footnotes inside image and table blocks)"""
    for block in sorted_blocks:
        if block['type'] in [BlockType.Image, BlockType.Table]:
            block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])

    """Get the lists that QA needs externalized"""
    images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)

    """Construct pdf_info_dict"""
    page_info = ocr_construct_page_component_v2(
        sorted_blocks, [], page_id, page_w, page_h, [], images, tables, interline_equations,
        fix_discarded_blocks, need_drop, drop_reason,
    )
    return page_info


def pdf_parse_union(
    model_list,
    dataset: Dataset,
    imageWriter,
    parse_mode,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
    lang=None,
):
    pdf_bytes_md5 = compute_md5(dataset.data_bits())

    """Initialize an empty pdf_info_dict"""
    pdf_info_dict = {}

    """Initialize magic_model with model_list and the docs object"""
    magic_model = MagicModel(model_list, dataset)

    """Parse the pdf over the requested page range"""
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(dataset) - 1
    )

    if end_page_id > len(dataset) - 1:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = len(dataset) - 1

    # """Initialize the start time"""
    # start_time = time.time()

    # for page_id, page in enumerate(dataset):
    for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
        # """In debug mode, log the time spent parsing each page."""
        # if debug_mode:
        #     time_now = time.time()
        #     logger.info(
        #         f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
        #     )
        #     start_time = time_now

        """Parse each page of the pdf"""
        if start_page_id <= page_id <= end_page_id:
            page_info = parse_page_core(
                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
            )
        else:
            page_info = page.get_page_info()
            page_w = page_info.w
            page_h = page_info.h
            page_info = ocr_construct_page_component_v2(
                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
            )
        pdf_info_dict[f'page_{page_id}'] = page_info

    need_ocr_list = []
    img_crop_list = []
    text_block_list = []
    for pange_id, page_info in pdf_info_dict.items():
        for block in page_info['preproc_blocks']:
            if block['type'] in ['table', 'image']:
                for sub_block in block['blocks']:
                    if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
                        text_block_list.append(sub_block)
            elif block['type'] in ['text', 'title']:
                text_block_list.append(block)
        for block in page_info['discarded_blocks']:
            text_block_list.append(block)
    for block in text_block_list:
        for line in block['lines']:
            for span in line['spans']:
                if 'np_img' in span:
                    need_ocr_list.append(span)
                    img_crop_list.append(span['np_img'])
                    span.pop('np_img')
    if len(img_crop_list) > 0:
        # Get OCR results for this language's images
        atom_model_manager = AtomModelSingleton()
        ocr_model = atom_model_manager.get_atom_model(
            atom_model_name='ocr',
            ocr_show_log=False,
            det_db_box_thresh=0.3,
            lang=lang
        )
        # rec_start = time.time()
        ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
        # Verify we have matching counts
        assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
        # Process OCR results for this language
        for index, span in enumerate(need_ocr_list):
            ocr_text, ocr_score = ocr_res_list[index]
            span['content'] = ocr_text
            span['score'] = float(f"{ocr_score:.3f}")
        # rec_time = time.time() - rec_start
        # logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')

    """Paragraph splitting"""
    para_split(pdf_info_dict)

    """LLM-aided optimization"""
    llm_aided_config = get_llm_aided_config()
    if llm_aided_config is not None:
        """Formula optimization"""
        formula_aided_config = llm_aided_config.get('formula_aided', None)
        if formula_aided_config is not None:
            if formula_aided_config.get('enable', False):
                llm_aided_formula_start_time = time.time()
                llm_aided_formula(pdf_info_dict, formula_aided_config)
                logger.info(f'llm aided formula time: {round(time.time() - llm_aided_formula_start_time, 2)}')
        """Text optimization"""
        text_aided_config = llm_aided_config.get('text_aided', None)
        if text_aided_config is not None:
            if text_aided_config.get('enable', False):
                llm_aided_text_start_time = time.time()
                llm_aided_text(pdf_info_dict, text_aided_config)
                logger.info(f'llm aided text time: {round(time.time() - llm_aided_text_start_time, 2)}')
        """Title optimization"""
        title_aided_config = llm_aided_config.get('title_aided', None)
        if title_aided_config is not None:
            if title_aided_config.get('enable', False):
                llm_aided_title_start_time = time.time()
                llm_aided_title(pdf_info_dict, title_aided_config)
                logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')

    """Convert the dict to a list"""
    pdf_info_list = dict_to_list(pdf_info_dict)
    new_pdf_info_dict = {
        'pdf_info': pdf_info_list,
    }

    clean_memory(get_device())

    return new_pdf_info_dict


if __name__ == '__main__':
    pass
```
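
The contrast gate above (standard deviation over mean of the grayscale crop, with a 0.17 cutoff) is easy to sanity-check in isolation. A minimal sketch with synthetic crops; the arrays are illustrative, not taken from the pipeline:

```python
import numpy as np
import cv2

def calculate_contrast(img, img_mode) -> float:
    # Same definition as above: std / (mean + eps) on the grayscale image
    code = cv2.COLOR_BGR2GRAY if img_mode == 'bgr' else cv2.COLOR_RGB2GRAY
    gray = cv2.cvtColor(img, code)
    return round(float(np.std(gray)) / (float(np.mean(gray)) + 1e-6), 2)

# A near-uniform light-gray crop: contrast ~0.0, would be skipped (<= 0.17)
flat = np.full((32, 128, 3), 200, dtype=np.uint8)
# A black-text-on-white style crop: high contrast, would be kept for OCR
text_like = np.full((32, 128, 3), 255, dtype=np.uint8)
text_like[12:20, :] = 0

print(calculate_contrast(flat, 'bgr'))       # ~0.0 -> dropped before OCR
print(calculate_contrast(text_like, 'bgr'))  # ~0.58, well above 0.17 -> kept
```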
magic_pdf/pre_proc/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/pre_proc/construct_page_dict.py (deleted, 100644 → 0)

```python
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h,
                                    layout_tree, images, tables, interline_equations,
                                    discarded_blocks, need_drop, drop_reason):
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'discarded_blocks': discarded_blocks,
        'need_drop': need_drop,
        'drop_reason': drop_reason,
    }
    return return_dict
```
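
For reference, `pdf_parse_union` above builds its placeholder entry for skipped pages with exactly this constructor. A standalone sketch; the page number and page size here are illustrative:

```python
# Illustrative placeholder page, mirroring the 'skip page' path in pdf_parse_union.
page_info = ocr_construct_page_component_v2(
    [], [], page_id=3, page_w=612, page_h=792,
    layout_tree=[], images=[], tables=[], interline_equations=[],
    discarded_blocks=[], need_drop=True, drop_reason='skip page',
)
assert page_info['page_idx'] == 3 and page_info['page_size'] == [612, 792]
```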
magic_pdf/pre_proc/cut_image.py (deleted, 100644 → 0)

```python
from loguru import logger

from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.pdf_image_tools import cut_image


def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    def return_path(type):
        return join_path(pdf_bytes_md5, type)

    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
                                           imageWriter=imageWriter)
        elif span_type == ContentType.Table:
            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                           imageWriter=imageWriter)

    return spans


def check_img_bbox(bbox) -> bool:
    if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
        logger.warning(f'image_bboxes: invalid box, {bbox}')
        return False
    return True
```
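
Note that `check_img_bbox` only rejects degenerate boxes, i.e. those with zero or negative width or height. A quick illustrative check:

```python
# Illustrative coordinates; [x0, y0, x1, y1] with x0 < x1 and y0 < y1 expected.
assert check_img_bbox([10, 10, 110, 60]) is True    # 100x50 box: valid
assert check_img_bbox([110, 10, 10, 60]) is False   # x0 >= x1: rejected, warning logged
assert check_img_bbox([10, 60, 110, 60]) is False   # zero height: rejected
```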
magic_pdf/pre_proc/ocr_span_list_modify.py (deleted, 100644 → 0)

```python
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio


def remove_overlaps_low_confidence_spans(spans):
    dropped_spans = []
    # Remove the lower-confidence ones among overlapping spans
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 should already be in dropped_spans
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                        if span1['score'] < span2['score']:
                            span_need_remove = span1
                        else:
                            span_need_remove = span2
                        if (
                            span_need_remove is not None
                            and span_need_remove not in dropped_spans
                        ):
                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def check_chars_is_overlap_in_span(chars):
    for i in range(len(chars)):
        for j in range(i + 1, len(chars)):
            if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.35:
                return True
    return False


def remove_x_overlapping_chars(span, median_width):
    """
    Remove characters from a span that overlap significantly on the x-axis.

    Args:
        median_width:
        span (dict): A span containing a list of chars, each with bbox coordinates
                     in the format [x0, y0, x1, y1]

    Returns:
        dict: The span with overlapping characters removed
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    overlap_threshold = median_width * 0.3

    i = 0
    while i < len(span['chars']) - 1:
        char1 = span['chars'][i]
        char2 = span['chars'][i + 1]

        # Calculate overlap width
        x_left = max(char1['bbox'][0], char2['bbox'][0])
        x_right = min(char1['bbox'][2], char2['bbox'][2])

        if x_right > x_left:
            # There is overlap
            overlap_width = x_right - x_left
            if overlap_width > overlap_threshold:
                if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
                    # Determine which character to remove
                    width1 = char1['bbox'][2] - char1['bbox'][0]
                    width2 = char2['bbox'][2] - char2['bbox'][0]
                    if width1 < width2:
                        # Remove the narrower character
                        span['chars'].pop(i)
                    else:
                        span['chars'].pop(i + 1)
                else:
                    i += 1
                # Don't increment i since we need to check the new pair
            else:
                i += 1
        else:
            i += 1

    return span


def remove_overlaps_min_spans(spans):
    dropped_spans = []
    # Remove the smaller ones among overlapping spans
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 should already be in dropped_spans
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                    if overlap_box is not None:
                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if span_need_remove is not None and span_need_remove not in dropped_spans:
                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def get_qa_need_list_v2(blocks):
    # Create copies of images, tables, interline_equations, inline_equations
    images = []
    tables = []
    interline_equations = []

    for block in blocks:
        if block['type'] == BlockType.Image:
            images.append(block)
        elif block['type'] == BlockType.Table:
            tables.append(block)
        elif block['type'] == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations
```
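
A small illustrative run of `remove_x_overlapping_chars` on a synthetic span; the char dicts mimic pymupdf's rawdict format and the coordinates are made up:

```python
# Two 'l' glyphs stacked almost on top of each other, plus a normal 'o'.
span = {'chars': [
    {'c': 'l', 'bbox': [10.0, 0.0, 14.0, 10.0]},
    {'c': 'l', 'bbox': [10.5, 0.0, 14.5, 10.0]},  # duplicate glyph, ~3.5pt x-overlap
    {'c': 'o', 'bbox': [16.0, 0.0, 24.0, 10.0]},
]}
median_width = 6.0  # overlap_threshold = 6.0 * 0.3 = 1.8
span = remove_x_overlapping_chars(span, median_width)
print(''.join(c['c'] for c in span['chars']))  # 'lo' - the duplicate glyph is dropped
```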
magic_pdf/pre_proc/remove_bbox_overlap.py (deleted, 100644 → 0)

```python
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.boxbase import _is_in, _is_part_overlap


def _remove_overlap_between_bbox(bbox1, bbox2):
    if _is_part_overlap(bbox1, bbox2):
        ix0, iy0, ix1, iy1 = bbox1
        x0, y0, x1, y1 = bbox2

        diff_x = min(x1, ix1) - max(x0, ix0)
        diff_y = min(y1, iy1) - max(y0, iy0)

        if diff_y > diff_x:
            if x1 >= ix1:
                mid = (x0 + ix1) // 2
                ix1 = min(mid - 0.25, ix1)
                x0 = max(mid + 0.25, x0)
            else:
                mid = (ix0 + x1) // 2
                ix0 = max(mid + 0.25, ix0)
                x1 = min(mid - 0.25, x1)
        else:
            if y1 >= iy1:
                mid = (y0 + iy1) // 2
                y0 = max(mid + 0.25, y0)
                iy1 = min(iy1, mid - 0.25)
            else:
                mid = (iy0 + y1) // 2
                y1 = min(y1, mid - 0.25)
                iy0 = max(mid + 0.25, iy0)

        if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
            bbox1 = [ix0, iy0, ix1, iy1]
            bbox2 = [x0, y0, x1, y1]
            return bbox1, bbox2, None
        else:
            return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
    else:
        return bbox1, bbox2, None


def _remove_overlap_between_bboxes(arr):
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]['bbox'], arr[j]['bbox']):
                keeps[i] = False

    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue

            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
                v['bbox'], res[i]['bbox']
            )
            if drop_reason is None:
                v['bbox'] = bbox1
                res[i]['bbox'] = bbox2
            else:
                if v['score'] > res[i]['score']:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons


def remove_overlap_between_bbox_for_span(spans):
    arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans]
    res, drop_reasons = _remove_overlap_between_bboxes(arr)
    ret = []
    for i in range(len(res)):
        if res[i] is None:
            continue
        spans[i]['bbox'] = res[i]['bbox']
        ret.append(spans[i])
    return ret, drop_reasons


def remove_overlap_between_bbox_for_block(all_bboxes):
    arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes]
    res, drop_reasons = _remove_overlap_between_bboxes(arr)
    ret = []
    for i in range(len(res)):
        if res[i] is None:
            continue
        all_bboxes[i][:4] = res[i]['bbox']
        ret.append(all_bboxes[i])
    return ret, drop_reasons
```
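
The splitting rule above shaves the overlap off along the axis with the smaller intersection, leaving a 0.5pt gap around the midpoint. A worked example with two boxes that overlap horizontally; the coordinates are illustrative, and it assumes `_is_part_overlap` reports this pair as partially overlapping, which is the intended case:

```python
# bbox1 ends at x=100, bbox2 starts at x=90: a 10pt-wide, full-height overlap.
# diff_x = 10, diff_y = 50, so the split happens along x at mid = (90 + 100) // 2 = 95.
b1, b2, reason = _remove_overlap_between_bbox([0, 0, 100, 50], [90, 0, 200, 50])
assert b1 == [0, 0, 94.75, 50]      # right edge pulled back to mid - 0.25
assert b2 == [95.25, 0, 200, 50]    # left edge pushed out to mid + 0.25
assert reason is None               # both boxes kept positive area
```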
magic_pdf/resources/model_config/model_configs.yaml (deleted, 100644 → 0)

```yaml
weights:
  layoutlmv3: Layout/LayoutLMv3/model_final.pth
  doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
  unimernet_small: MFR/unimernet_hf_small_2503
  struct_eqtable: TabRec/StructEqTable
  tablemaster: TabRec/TableMaster
  rapid_table: TabRec/RapidTable
```

\ No newline at end of file
magic_pdf/resources/slanet_plus/slanet-plus.onnx (deleted, 100644 → 0)

File deleted
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt (deleted, 100644 → 0)

File deleted
magic_pdf/spark/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/spark/spark_api.py (deleted, 100644 → 0)

```python
from loguru import logger

from magic_pdf.config.drop_reason import DropReason


def get_data_source(jso: dict):
    data_source = jso.get('data_source')
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source


def get_data_type(jso: dict):
    data_type = jso.get('data_type')
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get('bookid')
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso['_need_drop'] = True
    jso['_drop_reason'] = DropReason.Exception
    jso['_exception'] = f'ERROR: {e}'
    return jso


def get_bookname(jso: dict):
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f'{data_source}/{file_id}'
    return book_name


def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the json and return a dict."""
    return {
        '_pdf_type': jso['_pdf_type'],
        'model_list': jso['doc_layout_result'],
    }
```
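
A quick illustrative use of `exception_handler` on a Spark-style record; the record fields below are made up:

```python
jso = {'file_id': 'doc-001', 'data_source': 'demo-bucket'}
try:
    raise ValueError('layout result missing')
except ValueError as e:
    jso = exception_handler(jso, e)

# The record is flagged for dropping and carries the error text
assert jso['_need_drop'] is True
assert jso['_exception'] == 'ERROR: layout result missing'
```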
magic_pdf/tools/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/tools/cli.py (deleted, 100644 → 0)

```python
import os
import shutil
import tempfile
from pathlib import Path

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']


@click.command()
@click.version_option(__version__, '--version', '-v', help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    os.makedirs(output_dir, exist_ok=True)
    temp_dir = tempfile.mkdtemp()

    def read_fn(path: Path):
        if path.suffix in ms_office_suffixes:
            convert_file_to_pdf(str(path), temp_dir)
            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
        elif path.suffix in image_suffixes:
            with open(str(path), 'rb') as f:
                bits = f.read()
            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
            with open(fn, 'wb') as f:
                f.write(pdf_bytes)
        elif path.suffix in pdf_suffixes:
            fn = str(path)
        else:
            raise Exception(f'Unknown file suffix: {path.suffix}')

        disk_rw = FileBasedDataReader(os.path.dirname(fn))
        return disk_rw.read(os.path.basename(fn))

    def parse_doc(doc_path: Path, dataset: Dataset | None = None):
        try:
            file_name = str(Path(doc_path).stem)
            if dataset is None:
                pdf_data_or_dataset = read_fn(doc_path)
            else:
                pdf_data_or_dataset = dataset
            do_parse(
                output_dir,
                file_name,
                pdf_data_or_dataset,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        doc_paths = []
        for doc_path in Path(path).glob('*'):
            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
                if doc_path.suffix in ms_office_suffixes:
                    convert_file_to_pdf(str(doc_path), temp_dir)
                    doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
                elif doc_path.suffix in image_suffixes:
                    with open(str(doc_path), 'rb') as f:
                        bits = f.read()
                    pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
                    fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
                    with open(fn, 'wb') as f:
                        f.write(pdf_bytes)
                    doc_path = Path(fn)
                doc_paths.append(doc_path)
        datasets = batch_build_dataset(doc_paths, 4, lang)
        batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
    else:
        parse_doc(Path(path))

    shutil.rmtree(temp_dir)


if __name__ == '__main__':
    cli()
```
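
As a hedged sketch of driving this now-deleted CLI: this command was, to my understanding, exposed as the `magic-pdf` entry point (the equivalent shell call would be `magic-pdf -p demo.pdf -o ./output -m auto`), and the same invocation can be exercised in-process with click's test runner. The file names here are illustrative:

```python
# In-process invocation of the deleted CLI via click's test runner.
# 'demo.pdf' and './output' are placeholder names, not from this diff.
from click.testing import CliRunner
from magic_pdf.tools.cli import cli  # module path as it existed before this commit

runner = CliRunner()
result = runner.invoke(cli, ['-p', 'demo.pdf', '-o', './output', '-m', 'auto'])
print(result.exit_code, result.output)
```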
magic_pdf/tools/cli_dev.py
deleted
100644 → 0
View file @
3bd0ecf1
import json as json_parse
import os
from pathlib import Path

import click

import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods


def read_s3_path(s3path):
    bucket, key = parse_s3path(s3path)

    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
    may_range_params = parse_s3_range_params(s3path)
    if may_range_params is None or 2 != len(may_range_params):
        byte_start, byte_end = 0, -1
    else:
        byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
    return s3_rw.read_at(
        remove_non_official_s3_args(s3path),
        byte_start,
        byte_end,
    )


@click.group()
@click.version_option(__version__, '--version', '-v', help='Show version information')
def cli():
    pass


@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='Input jsonl path, either a local file or one on s3',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='Specify the parsing method. txt: parse a text-based pdf; ocr: parse the pdf with optical character recognition; auto: let the program choose automatically',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='Output to a local directory',
)
def jsonl(jsonl, method, output_dir):
    model_config.__use_inside_model__ = False
    if jsonl.startswith('s3://'):
        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
    else:
        with open(jsonl) as f:
            jso = json_parse.loads(f.readline())
    os.makedirs(output_dir, exist_ok=True)
    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='Local PDF file',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='JSON data produced by local model inference',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='Local output directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='Specify the parsing method. txt: parse a text-based pdf; ocr: parse the pdf with optical character recognition; auto: let the program choose automatically',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    model_config.__use_inside_model__ = False
    full_pdf_path = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))

    model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))

    file_name = str(Path(full_pdf_path).stem)
    pdf_data = read_fn(full_pdf_path)
    do_parse(
        output_dir,
        file_name,
        pdf_data,
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


if __name__ == '__main__':
    cli()
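For reference, a hedged sketch of the one-record-per-line input the `jsonl` subcommand reads: it looks up `file_location` (falling back to `path`) and hands `doc_layout_result` to `do_parse` as the model list. The field values below are placeholders.

import json

record = {
    'file_location': 's3://bucket-name-1/papers/demo.pdf',  # the 'path' key also works
    'doc_layout_result': [],  # pre-computed model inference output, one entry per page
}
print(json.dumps(record, ensure_ascii=False))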
magic_pdf/tools/common.py
deleted
100644 → 0
View file @
3bd0ecf1
import os

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
                                                         doc_analyze)

# from io import BytesIO
# from pypdf import PdfReader, PdfWriter


def prepare_env(output_dir, pdf_file_name, method):
    local_parent_dir = os.path.join(output_dir, pdf_file_name, method)

    local_image_dir = os.path.join(str(local_parent_dir), 'images')
    local_md_dir = local_parent_dir
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)
    return local_image_dir, local_md_dir


# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the selected pages to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Fetch the buffer's contents
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    document = fitz.open('pdf', pdf_bytes)
    output_document = fitz.open()
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(document) - 1
    )
    if end_page_id > len(document) - 1:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = len(document) - 1
    output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
    output_bytes = output_document.tobytes()
    return output_bytes


def _do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    from magic_pdf.operators.models import InferenceResult
    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
        # f_draw_char_bbox = True

    if isinstance(pdf_bytes_or_dataset, bytes):
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
            pdf_bytes_or_dataset, start_page_id, end_page_id
        )
        ds = PymuDocDataset(pdf_bytes, lang=lang)
    else:
        ds = pdf_bytes_or_dataset
        pdf_bytes = ds._raw_data
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            if parse_method == 'auto':
                if ds.classify() == SupportedPdfParseMethod.TXT:
                    infer_result = ds.apply(
                        doc_analyze,
                        ocr=False,
                        lang=ds._lang,
                        layout_model=layout_model,
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
                    pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
                else:
                    infer_result = ds.apply(
                        doc_analyze,
                        ocr=True,
                        lang=ds._lang,
                        layout_model=layout_model,
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
                    pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
            elif parse_method == 'txt':
                infer_result = ds.apply(
                    doc_analyze,
                    ocr=False,
                    lang=ds._lang,
                    layout_model=layout_model,
                    formula_enable=formula_enable,
                    table_enable=table_enable,
                )
                pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
            elif parse_method == 'ocr':
                infer_result = ds.apply(
                    doc_analyze,
                    ocr=True,
                    lang=ds._lang,
                    layout_model=layout_model,
                    formula_enable=formula_enable,
                    table_enable=table_enable,
                )
                pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
            else:
                logger.error('unknown parse method')
                exit(1)
        else:
            logger.error('need model list input')
            exit(2)
    else:
        infer_result = InferenceResult(model_list, ds)
        if parse_method == 'ocr':
            pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
        elif parse_method == 'txt':
            pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
        else:
            if ds.classify() == SupportedPdfParseMethod.TXT:
                pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
            else:
                pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)

    if f_draw_model_bbox:
        infer_result.draw_model(os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf'))
    if f_draw_layout_bbox:
        pipe_result.draw_layout(os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf'))
    if f_draw_span_bbox:
        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf'))
    if f_draw_char_bbox:
        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    if f_dump_content_list:
        pipe_result.dump_content_list(md_writer, f'{pdf_file_name}_content_list.json', image_dir)

    logger.info(f'local output dir is {local_md_dir}')


def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    parallel_count = 1
    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
        )


def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    dss = []
    for v in pdf_bytes_or_datasets:
        if isinstance(v, bytes):
            dss.append(PymuDocDataset(v, lang=lang))
        else:
            dss.append(v)
    infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir=output_dir,
            pdf_file_name=pdf_file_names[idx],
            pdf_bytes_or_dataset=dss[idx],
            model_list=infer_result.get_infer_res(),
            parse_method=parse_method,
            debug_able=debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=MakeMode.MM_MD,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )


parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
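For reference, a minimal sketch of driving do_parse directly, assuming the packaged models are available (model_config.__use_inside_model__ is true); the file names are hypothetical. Setting MINERU_PARALLEL_INFERENCE_COUNT above 1 reroutes the same call through batch_do_parse.

from magic_pdf.tools.common import do_parse

with open('demo.pdf', 'rb') as f:  # hypothetical input file
    pdf_bytes = f.read()

do_parse(
    './output',   # output_dir; prepare_env puts results under ./output/demo/auto/
    'demo',       # pdf_file_name
    pdf_bytes,    # raw bytes are wrapped in a PymuDocDataset internally
    [],           # empty model_list -> run doc_analyze with the built-in models
    'auto',       # parse_method
    lang='en',
)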
magic_pdf/utils/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/utils/annotations.py
deleted
100644 → 0
View file @
3bd0ecf1
from loguru import logger


def ImportPIL(f):
    try:
        import PIL  # noqa: F401
    except ImportError:
        logger.error('Pillow not installed, please install by pip.')
        exit(1)
    return f
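A hedged usage sketch: ImportPIL is a decorator that aborts at decoration time if Pillow cannot be imported and otherwise returns the function unchanged. The decorated function below is hypothetical.

from magic_pdf.utils.annotations import ImportPIL


@ImportPIL
def image_size(path):
    from PIL import Image
    with Image.open(path) as im:
        return im.size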
magic_pdf/utils/office_to_pdf.py
deleted
100644 → 0
View file @
3bd0ecf1
import os
import subprocess
import platform
from pathlib import Path
import shutil
from loguru import logger


class ConvertToPdfError(Exception):
    def __init__(self, msg):
        self.msg = msg
        super().__init__(self.msg)


def check_fonts_installed():
    """Check if required Chinese fonts are installed."""
    system_type = platform.system()
    if system_type in ['Windows', 'Darwin']:
        pass
    else:
        # Linux: use fc-list
        try:
            output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
            if output.strip():
                # Any non-empty output means a Chinese font is present
                return True
            else:
                logger.warning('No Chinese fonts were detected, the converted document may not display Chinese content properly.')
        except Exception:
            pass


def get_soffice_command():
    """Return the path to LibreOffice's soffice executable depending on the platform."""
    system_type = platform.system()

    # First check if soffice is in PATH
    soffice_path = shutil.which('soffice')
    if soffice_path:
        return soffice_path

    if system_type == 'Windows':
        # Check common installation paths
        possible_paths = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
        ]
        # Check other drives for windows
        for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
            possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))

        for path in possible_paths:
            if path.exists():
                return str(path)

        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )
    else:
        # For Linux/macOS, provide installation instructions if not found
        try:
            # Try to find soffice in standard locations
            possible_paths = [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/Applications/LibreOffice.app/Contents/MacOS/soffice'
            ]
            for path in possible_paths:
                if os.path.exists(path):
                    return path

            raise ConvertToPdfError(
                "LibreOffice not found. Please install it:\n"
                "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
                "  - CentOS/RHEL: sudo yum install libreoffice\n"
                "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
                "  - Or ensure soffice is in your PATH environment variable."
            )
        except Exception as e:
            raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")


def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF."""
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    check_fonts_installed()
    soffice_cmd = get_soffice_command()

    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
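A minimal usage sketch, assuming LibreOffice is installed; the input file name is hypothetical.

from magic_pdf.utils.office_to_pdf import ConvertToPdfError, convert_file_to_pdf

try:
    convert_file_to_pdf('slides.pptx', '/tmp/pdf_out')
    # soffice writes the result to /tmp/pdf_out/slides.pdf
except ConvertToPdfError as e:
    print(f'conversion failed: {e}')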
magic-pdf.template.json → mineru.template.json
View file @
0c7a0882
@@ -3,23 +3,6 @@
         "bucket-name-1":["ak", "sk", "endpoint"],
         "bucket-name-2":["ak", "sk", "endpoint"]
     },
-    "models-dir": "/tmp/models",
-    "layoutreader-model-dir": "/tmp/layoutreader",
-    "device-mode": "cpu",
-    "layout-config": {
-        "model": "doclayout_yolo"
-    },
-    "formula-config": {
-        "mfd_model": "yolo_v8_mfd",
-        "mfr_model": "unimernet_small",
-        "enable": true
-    },
-    "table-config": {
-        "model": "rapid_table",
-        "sub_model": "slanet_plus",
-        "enable": true,
-        "max_time": 400
-    },
     "latex-delimiter-config": {
         "display": {
             "left": "$$",
@@ -31,18 +14,6 @@
         }
     },
     "llm-aided-config": {
-        "formula_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
-        "text_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
         "title_aided": {
             "api_key": "your_api_key",
             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
@@ -50,5 +21,9 @@
             "enable": false
         }
     },
-    "config_version": "1.2.1"
+    "models-dir": {
+        "pipeline": "",
+        "vlm": ""
+    },
+    "config_version": "1.3.0"
 }
\ No newline at end of file
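The practical upshot of this rename and schema change, as a hedged Python sketch (the config path is hypothetical): "models-dir" becomes an object with separate pipeline and vlm entries, and config_version moves to 1.3.0.

import json

with open('mineru.template.json') as f:
    cfg = json.load(f)

assert cfg['config_version'] == '1.3.0'
pipeline_models_dir = cfg['models-dir']['pipeline']  # '' in the template
vlm_models_dir = cfg['models-dir']['vlm']            # '' in the template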