Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...
Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, 
magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, 
magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, 
magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
826086d2 · zhougaofeng · 57aaa1cf · 57aaa1cf · 57aaa1cf · 57aaa1cf
Commit 826086d2 authored Nov 12, 2024 by zhougaofeng
20 changed files
--- a/magic_pdf/integrations/rag/api.py
+++ b/magic_pdf/integrations/rag/api.py
-import os
-from pathlib import Path
-
-from loguru import logger
-
-from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
-                                             Node)
-from magic_pdf.integrations.rag.utils import inference
-
-
-class RagPageReader:
-
-    def __init__(self, pagedata: LayoutElements):
-        self.o = [
-            Node(
-                category_type=v.category_type,
-                text=v.text,
-                image_path=v.image_path,
-                anno_id=v.anno_id,
-                latex=v.latex,
-                html=v.html,
-            ) for v in pagedata.layout_dets
-        ]
-
-        self.pagedata = pagedata
-
-    def __iter__(self):
-        return iter(self.o)
-
-    def get_rel_map(self) -> list[ElementRelation]:
-        return self.pagedata.extra.element_relation
-
-
-class RagDocumentReader:
-
-    def __init__(self, ragdata: list[LayoutElements]):
-        self.o = [RagPageReader(v) for v in ragdata]
-
-    def __iter__(self):
-        return iter(self.o)
-
-
-class DataReader:
-
-    def __init__(self, path_or_directory: str, method: str, output_dir: str):
-        self.path_or_directory = path_or_directory
-        self.method = method
-        self.output_dir = output_dir
-        self.pdfs = []
-        if os.path.isdir(path_or_directory):
-            for doc_path in Path(path_or_directory).glob('*.pdf'):
-                self.pdfs.append(doc_path)
-        else:
-            assert path_or_directory.endswith('.pdf')
-            self.pdfs.append(Path(path_or_directory))
-
-    def get_documents_count(self) -> int:
-        """Returns the number of documents in the directory."""
-        return len(self.pdfs)
-
-    def get_document_result(self, idx: int) -> RagDocumentReader | None:
-        """
-        Args:
-            idx (int): the index of documents under the
-                directory path_or_directory
-
-        Returns:
-            RagDocumentReader | None: RagDocumentReader is an iterable object,
-            more details @RagDocumentReader
-        """
-        if idx >= self.get_documents_count() or idx < 0:
-            logger.error(f'invalid idx: {idx}')
-            return None
-        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
-        if res is None:
-            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
-            return None
-        return RagDocumentReader(res)
-
-    def get_document_filename(self, idx: int) -> Path:
-        """get the filename of the document."""
-        return self.pdfs[idx]
--- a/magic_pdf/integrations/rag/type.py
+++ b/magic_pdf/integrations/rag/type.py
-from enum import Enum
-
-from pydantic import BaseModel, Field
-
-
-# rag
-class CategoryType(Enum):  # py310 not support StrEnum
-    text = 'text'
-    title = 'title'
-    interline_equation = 'interline_equation'
-    image = 'image'
-    image_body = 'image_body'
-    image_caption = 'image_caption'
-    table = 'table'
-    table_body = 'table_body'
-    table_caption = 'table_caption'
-    table_footnote = 'table_footnote'
-
-
-class ElementRelType(Enum):
-    sibling = 'sibling'
-
-
-class PageInfo(BaseModel):
-    page_no: int = Field(description='the index of page, start from zero',
-                         ge=0)
-    height: int = Field(description='the height of page', gt=0)
-    width: int = Field(description='the width of page', ge=0)
-    image_path: str | None = Field(description='the image of this page',
-                                   default=None)
-
-
-class ContentObject(BaseModel):
-    category_type: CategoryType = Field(description='类别')
-    poly: list[float] = Field(
-        description=('Coordinates, need to convert back to PDF coordinates,'
-                     ' order is top-left, top-right, bottom-right, bottom-left'
-                     ' x,y coordinates'))
-    ignore: bool = Field(description='whether ignore this object',
-                         default=False)
-    text: str | None = Field(description='text content of the object',
-                             default=None)
-    image_path: str | None = Field(description='path of embedded image',
-                                   default=None)
-    order: int = Field(description='the order of this object within a page',
-                       default=-1)
-    anno_id: int = Field(description='unique id', default=-1)
-    latex: str | None = Field(description='latex result', default=None)
-    html: str | None = Field(description='html result', default=None)
-
-
-class ElementRelation(BaseModel):
-    source_anno_id: int = Field(description='unique id of the source object',
-                                default=-1)
-    target_anno_id: int = Field(description='unique id of the target object',
-                                default=-1)
-    relation: ElementRelType = Field(
-        description='the relation between source and target element')
-
-
-class LayoutElementsExtra(BaseModel):
-    element_relation: list[ElementRelation] = Field(
-        description='the relation between source and target element')
-
-
-class LayoutElements(BaseModel):
-    layout_dets: list[ContentObject] = Field(
-        description='layout element details')
-    page_info: PageInfo = Field(description='page info')
-    extra: LayoutElementsExtra = Field(description='extra information')
-
-
-# iter data format
-class Node(BaseModel):
-    category_type: CategoryType = Field(description='类别')
-    text: str | None = Field(description='text content of the object',
-                             default=None)
-    image_path: str | None = Field(description='path of embedded image',
-                                   default=None)
-    anno_id: int = Field(description='unique id', default=-1)
-    latex: str | None = Field(description='latex result', default=None)
-    html: str | None = Field(description='html result', default=None)
--- a/magic_pdf/integrations/rag/utils.py
+++ b/magic_pdf/integrations/rag/utils.py
-import json
-import os
-from pathlib import Path
-
-from loguru import logger
-
-import magic_pdf.model as model_config
-from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
-from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
-                                             ElementRelation, ElementRelType,
-                                             LayoutElements,
-                                             LayoutElementsExtra, PageInfo)
-from magic_pdf.libs.ocr_content_type import BlockType, ContentType
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.tools.common import do_parse, prepare_env
-
-
-def convert_middle_json_to_layout_elements(
-    json_data: dict,
-    output_dir: str,
-) -> list[LayoutElements]:
-    uniq_anno_id = 0
-
-    res: list[LayoutElements] = []
-    for page_no, page_data in enumerate(json_data['pdf_info']):
-        order_id = 0
-        page_info = PageInfo(
-            height=int(page_data['page_size'][1]),
-            width=int(page_data['page_size'][0]),
-            page_no=page_no,
-        )
-        layout_dets: list[ContentObject] = []
-        extra_element_relation: list[ElementRelation] = []
-
-        for para_block in page_data['para_blocks']:
-            para_text = ''
-            para_type = para_block['type']
-
-            if para_type == BlockType.Text:
-                para_text = merge_para_with_text(para_block)
-                x0, y0, x1, y1 = para_block['bbox']
-                content = ContentObject(
-                    anno_id=uniq_anno_id,
-                    category_type=CategoryType.text,
-                    text=para_text,
-                    order=order_id,
-                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                )
-                uniq_anno_id += 1
-                order_id += 1
-                layout_dets.append(content)
-
-            elif para_type == BlockType.Title:
-                para_text = merge_para_with_text(para_block)
-                x0, y0, x1, y1 = para_block['bbox']
-                content = ContentObject(
-                    anno_id=uniq_anno_id,
-                    category_type=CategoryType.title,
-                    text=para_text,
-                    order=order_id,
-                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                )
-                uniq_anno_id += 1
-                order_id += 1
-                layout_dets.append(content)
-
-            elif para_type == BlockType.InterlineEquation:
-                para_text = merge_para_with_text(para_block)
-                x0, y0, x1, y1 = para_block['bbox']
-                content = ContentObject(
-                    anno_id=uniq_anno_id,
-                    category_type=CategoryType.interline_equation,
-                    text=para_text,
-                    order=order_id,
-                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                )
-                uniq_anno_id += 1
-                order_id += 1
-                layout_dets.append(content)
-
-            elif para_type == BlockType.Image:
-                body_anno_id = -1
-                caption_anno_id = -1
-
-                for block in para_block['blocks']:
-                    if block['type'] == BlockType.ImageBody:
-                        for line in block['lines']:
-                            for span in line['spans']:
-                                if span['type'] == ContentType.Image:
-                                    x0, y0, x1, y1 = block['bbox']
-                                    content = ContentObject(
-                                        anno_id=uniq_anno_id,
-                                        category_type=CategoryType.image_body,
-                                        image_path=os.path.join(
-                                            output_dir, span['image_path']),
-                                        order=order_id,
-                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                                    )
-                                    body_anno_id = uniq_anno_id
-                                    uniq_anno_id += 1
-                                    order_id += 1
-                                    layout_dets.append(content)
-
-                for block in para_block['blocks']:
-                    if block['type'] == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(block)
-                        x0, y0, x1, y1 = block['bbox']
-                        content = ContentObject(
-                            anno_id=uniq_anno_id,
-                            category_type=CategoryType.image_caption,
-                            text=para_text,
-                            order=order_id,
-                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                        )
-                        caption_anno_id = uniq_anno_id
-                        uniq_anno_id += 1
-                        order_id += 1
-                        layout_dets.append(content)
-
-                if body_anno_id > 0 and caption_anno_id > 0:
-                    element_relation = ElementRelation(
-                        relation=ElementRelType.sibling,
-                        source_anno_id=body_anno_id,
-                        target_anno_id=caption_anno_id,
-                    )
-                    extra_element_relation.append(element_relation)
-
-            elif para_type == BlockType.Table:
-                body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1
-
-                for block in para_block['blocks']:
-                    if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block)
-                        x0, y0, x1, y1 = block['bbox']
-                        content = ContentObject(
-                            anno_id=uniq_anno_id,
-                            category_type=CategoryType.table_caption,
-                            text=para_text,
-                            order=order_id,
-                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                        )
-                        caption_anno_id = uniq_anno_id
-                        uniq_anno_id += 1
-                        order_id += 1
-                        layout_dets.append(content)
-
-                for block in para_block['blocks']:
-                    if block['type'] == BlockType.TableBody:
-                        for line in block['lines']:
-                            for span in line['spans']:
-                                if span['type'] == ContentType.Table:
-                                    x0, y0, x1, y1 = para_block['bbox']
-                                    content = ContentObject(
-                                        anno_id=uniq_anno_id,
-                                        category_type=CategoryType.table_body,
-                                        order=order_id,
-                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                                    )
-                                    body_anno_id = uniq_anno_id
-                                    uniq_anno_id += 1
-                                    order_id += 1
-                                    # if processed by table model
-                                    if span.get('latex', ''):
-                                        content.latex = span['latex']
-                                    else:
-                                        content.image_path = os.path.join(
-                                            output_dir, span['image_path'])
-                                    layout_dets.append(content)
-
-                for block in para_block['blocks']:
-                    if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block)
-                        x0, y0, x1, y1 = block['bbox']
-                        content = ContentObject(
-                            anno_id=uniq_anno_id,
-                            category_type=CategoryType.table_footnote,
-                            text=para_text,
-                            order=order_id,
-                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
-                        )
-                        footnote_anno_id = uniq_anno_id
-                        uniq_anno_id += 1
-                        order_id += 1
-                        layout_dets.append(content)
-
-                if caption_anno_id != -1 and body_anno_id != -1:
-                    element_relation = ElementRelation(
-                        relation=ElementRelType.sibling,
-                        source_anno_id=body_anno_id,
-                        target_anno_id=caption_anno_id,
-                    )
-                    extra_element_relation.append(element_relation)
-
-                if footnote_anno_id != -1 and body_anno_id != -1:
-                    element_relation = ElementRelation(
-                        relation=ElementRelType.sibling,
-                        source_anno_id=body_anno_id,
-                        target_anno_id=footnote_anno_id,
-                    )
-                    extra_element_relation.append(element_relation)
-
-        res.append(
-            LayoutElements(
-                page_info=page_info,
-                layout_dets=layout_dets,
-                extra=LayoutElementsExtra(
-                    element_relation=extra_element_relation),
-            ))
-
-    return res
-
-
-def inference(path, output_dir, method):
-    model_config.__use_inside_model__ = True
-    model_config.__model_mode__ = 'full'
-    if output_dir == '':
-        if os.path.isdir(path):
-            output_dir = os.path.join(path, 'output')
-        else:
-            output_dir = os.path.join(os.path.dirname(path), 'output')
-
-    local_image_dir, local_md_dir = prepare_env(output_dir,
-                                                str(Path(path).stem), method)
-
-    def read_fn(path):
-        disk_rw = DiskReaderWriter(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
-
-    def parse_doc(doc_path: str):
-        try:
-            file_name = str(Path(doc_path).stem)
-            pdf_data = read_fn(doc_path)
-            do_parse(
-                output_dir,
-                file_name,
-                pdf_data,
-                [],
-                method,
-                False,
-                f_draw_span_bbox=False,
-                f_draw_layout_bbox=False,
-                f_dump_md=False,
-                f_dump_middle_json=True,
-                f_dump_model_json=False,
-                f_dump_orig_pdf=False,
-                f_dump_content_list=False,
-                f_draw_model_bbox=False,
-            )
-
-            middle_json_fn = os.path.join(local_md_dir,
-                                          f'{file_name}_middle.json')
-            with open(middle_json_fn) as fd:
-                jso = json.load(fd)
-            os.remove(middle_json_fn)
-            return convert_middle_json_to_layout_elements(jso, local_image_dir)
-
-        except Exception as e:
-            logger.exception(e)
-
-    return parse_doc(path)
-
-
-if __name__ == '__main__':
-    import pprint
-
-    base_dir = '/opt/data/pdf/resources/samples/'
-    if 0:
-        with open(base_dir + 'json_outputs/middle.json') as f:
-            d = json.load(f)
-        result = convert_middle_json_to_layout_elements(d, '/tmp')
-        pprint.pp(result)
-    if 0:
-        with open(base_dir + 'json_outputs/middle.3.json') as f:
-            d = json.load(f)
-        result = convert_middle_json_to_layout_elements(d, '/tmp')
-        pprint.pp(result)
-
-    if 1:
-        res = inference(
-            base_dir + 'samples/pdf/one_page_with_table_image.pdf',
-            '/tmp/output',
-            'ocr',
-        )
-        pprint.pp(res)
--- a/magic_pdf/layout/__init__.py
+++ b/magic_pdf/layout/__init__.py
--- a/magic_pdf/layout/bbox_sort.py
+++ b/magic_pdf/layout/bbox_sort.py
-# 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
-# 其中x0, y0代表左上角坐标，x1, y1代表右下角坐标，坐标原点在左上角。
-
-
-
-from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
-from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
-from magic_pdf.libs.commons import mymax
-
-X0_IDX = 0
-Y0_IDX = 1
-X1_IDX = 2
-Y1_IDX = 3
-CONTENT_IDX = 4
-IDX_X = 5
-IDX_Y = 6
-CONTENT_TYPE_IDX = 7
-
-X0_EXT_IDX = 8
-Y0_EXT_IDX = 9
-X1_EXT_IDX = 10
-Y1_EXT_IDX = 11
-
-
-def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
-    """
-    text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
-    把bbox重新组装成一个list，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是pymupdf里的block结构
-    """
-    all_bboxes = []
-    
-    for image in image_info:
-        box = image['bbox']
-        # 由于没有实现横向的栏切分，因此在这里先过滤掉一些小的图片。这些图片有可能影响layout，造成没有横向栏切分的情况下，layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
-        # 把长宽都小于50的去掉
-        if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
-            continue
-        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
-        
-    for table in table_info:
-        box = table['bbox']
-        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
-    
-    """由于公式与段落混合，因此公式不再参与layout划分，无需加入all_bboxes"""
-    # 加入文本block
-    text_block_temp = []
-    for block in text_raw_blocks:
-        bbox = block['bbox']
-        text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
-        
-    text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)   
-    text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox，有可能让layout探测陷入无限循环
-    
-        
-    """找出会影响layout的色块、横向分割线"""
-    spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
-    # 还要去掉存在于spilter_bboxes里的text_block
-    if len(spilter_bboxes) > 0:
-        text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
-        
-    for bbox in text_block_new:
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) 
-        
-    for bbox in spilter_bboxes:
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
-    
-     
-    return all_bboxes
-
-def resolve_bbox_overlap_for_layout_det(bboxes:list):
-    """
-    1. 去掉bbox互相包含的，去掉被包含的
-    2. 上下方向上如果有重叠，就扩大大box范围，直到覆盖小box
-    """
-    def _is_in_other_bbox(i:int):
-        """
-        判断i个box是否被其他box有所包含
-        """
-        for j in range(0, len(bboxes)):
-            if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
-                return True
-            # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
-            #     return True
-            
-        return False
-    
-    # 首先去掉被包含的bbox
-    new_bbox_1 = []
-    for i in range(0, len(bboxes)):
-        if not _is_in_other_bbox(i):
-            new_bbox_1.append(bboxes[i])
-            
-    # 其次扩展大的box
-    new_box = []
-    new_bbox_2 = []
-    len_1 = len(new_bbox_2)
-    while True:
-        merged_idx = []
-        for i in range(0, len(new_bbox_1)):
-            if i in merged_idx:
-                continue
-            for j in range(i+1, len(new_bbox_1)):
-                if j in merged_idx:
-                    continue
-                bx1 = new_bbox_1[i]
-                bx2 = new_bbox_1[j]
-                if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
-                    merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
-                    new_bbox_2.append(merged_box)
-                    merged_idx.append(i)
-                    merged_idx.append(j)
-                    
-        for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
-            if i not in merged_idx:
-                new_bbox_2.append(new_bbox_1[i])        
-
-        if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
-            break
-        else:
-            len_1 = len(new_bbox_2)
-            new_box = new_bbox_2
-            new_bbox_1, new_bbox_2 = new_bbox_2, []
-                        
-    return new_box
-
-
-def filter_lines_bbox(bboxes: list):
-    """
-    过滤掉bbox为空的行
-    """
-    new_box = []
-    for box in bboxes:
-        x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
-        if abs(x0-x1)<=1 or abs(y0-y1)<=1:
-            continue
-        else:
-            new_box.append(box)
-    return new_box
-
-
-################################################################################
-# 第一种排序算法
-# 以下是基于延长线遮挡做的一个算法
-#
-################################################################################
-def find_all_left_bbox(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox左边的所有bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
-    return left_boxes
-
-
-def find_all_top_bbox(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox上面的所有bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
-    return top_boxes
-
-
-def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
-    """
-    寻找this_bbox在all_bboxes中的遮挡深度 idx_x
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
-        if len(all_left_bboxes) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
-            max_idx_x = mymax(all_left_bboxes_idx)
-            this_bbox[IDX_X] = max_idx_x + 1
-        return this_bbox[IDX_X]
-
-
-def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
-    """
-    寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
-        if len(all_top_bboxes) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
-            max_idx_y = mymax(all_top_bboxes_idx)
-            this_bbox[IDX_Y] = max_idx_y + 1
-        return this_bbox[IDX_Y]
-
-
-def bbox_sort(all_bboxes: list):
-    """
-    排序
-    """
-    all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-
-
-################################################################################
-# 第二种排序算法
-# 下面的算法在计算idx_x和idx_y的时候不考虑延长线，而只考虑实际的长或者宽被遮挡的情况
-#
-################################################################################
-
-def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-        
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        left_boxes = [left_boxes[0]]
-    else:
-        left_boxes = []
-    return left_boxes
-
-
-def get_and_set_idx_x_2(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
-        if len(left_nearest_bbox) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
-            this_bbox[IDX_X] = left_idx_x + 1
-        return this_bbox[IDX_X]
-
-
-def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(top_boxes) > 0:
-        top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_boxes = [top_boxes[0]]
-    else:
-        top_boxes = []
-    return top_boxes
-
-
-def get_and_set_idx_y_2(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
-        if len(top_nearest_bbox) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
-            this_bbox[IDX_Y] = top_idx_y + 1
-        return this_bbox[IDX_Y]
-
-
-def paper_bbox_sort(all_bboxes: list, page_width, page_height):
-    all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-
-################################################################################
-"""
-第三种排序算法, 假设page的最左侧为X0，最右侧为X1，最上侧为Y0，最下侧为Y1
-这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下：
-1. 首先在水平方向上对bbox进行扩展。扩展方法是：
-    - 对每个bbox，找到其左边最近的bbox（也就是y方向有重叠），然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox，那么就将其左边界扩展到page的最左侧X0。
-    - 对每个bbox，找到其右边最近的bbox（也就是y方向有重叠），然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox，那么就将其右边界扩展到page的最右侧X1。
-    - 经过上面2个步骤，bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
-    
-2. 合并所有的连续水平方向的bbox, 合并方法是：
-    - 对bbox进行y方向排序，然后从上到下遍历所有bbox，如果当前bbox和下一个bbox的x0, x1等于X0, X1，那么就合并这两个bbox。
-    
-3. 然后在垂直方向上对bbox进行扩展。扩展方法是：
-    - 首先从page上切割掉合并后的水平bbox, 得到几个新的block
-    针对每个block
-    - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有，则x0=X0
-    - x1: 找到位于右侧x=x1延长线右侧所有的bboxes， 找到最小的x0, 让x1=x0-1。如果没有，则x1=X1
-    随后在垂直方向上合并所有的连续的block，方法如下：
-    - 对block进行x方向排序，然后从左到右遍历所有block，如果当前block和下一个block的x0, x1相等，那么就合并这两个block。
-    如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’
-    如果在某个垂直方向上无法被完全分割到一个block，那么就将这个block打上标签'BAD_LAYOUT'。
-    至此完成，一个页面的预处理，天然的block要么属于'GOOD_LAYOUT'，要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面，可以先按照自上而下，自左到右进行天然排序，也可以先过滤掉这种书籍。
-    (完成条件下次加强：进行水平方向切分，把混乱的layout部分尽可能切割出去)
-"""
-################################################################################
-def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
-    这里使用扩展之后的bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
-         box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
-         this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
-         box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]
-        
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
-        left_boxes = left_boxes
-    else:
-        left_boxes = []
-    return left_boxes
-
-def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
-    这里使用扩展之后的bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
-        box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
-         this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
-        box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(top_boxes) > 0:
-        top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
-        top_boxes = top_boxes
-    else:
-        top_boxes = []
-    return top_boxes
-
-def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
-        if len(left_nearest_bbox) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox]
-            this_bbox[IDX_X] = mymax(left_idx_x) + 1
-        return this_bbox[IDX_X]
-   
-def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
-        if len(top_nearest_bbox) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
-            this_bbox[IDX_Y] = mymax(top_idx_y) + 1
-        return this_bbox[IDX_Y]
- 
-def _paper_bbox_sort_ext(all_bboxes: list):
-    all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-
-# ===============================================================================================
-def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox左边的所有bbox, 使用延长线
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
-    if len(left_boxes):
-        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    
-    return left_boxes
-
-def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox右边的所有bbox, 使用延长线
-    """
-    right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
-    if len(right_boxes):
-        right_boxes.sort(key=lambda x: x[X0_IDX])
-        right_boxes = right_boxes[0]
-    else:
-        right_boxes = None
-    return right_boxes
-
-# =============================================================================================
-
-def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox， 不用延长线并且不能像
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-        
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个——x1最大的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    return left_boxes
-
-def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
-        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-    
-    if len(right_bboxes)>0:
-        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
-        right_bboxes = right_bboxes[0]
-    else:
-        right_bboxes = None
-    return right_bboxes
-
-def reset_idx_x_y(all_boxes:list)->list:
-    for box in all_boxes:
-        box[IDX_X] = None
-        box[IDX_Y] = None
-        
-    return all_boxes
-
-# ===================================================================================================
-def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
-    """
-    找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到上方距离this_bbox最近的那个
-    if len(top_bboxes) > 0:
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-
-def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
-    """
-    找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(bottom_bboxes) > 0:
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-
-def find_boundry_bboxes(bboxes:list) -> tuple:
-    """
-    找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1)
-    """
-    x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
-    for box in bboxes:
-        x0 = min(box[X0_IDX], x0)
-        y0 = min(box[Y0_IDX], y0)
-        x1 = max(box[X1_IDX], x1)
-        y1 = max(box[Y1_IDX], y1)
-        
-    return x0, y0, x1, y1
-    
-
-def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
-    """
-    在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox
-    """
-    for box in bboxes:
-        top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
-        bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
-        if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列
-            box[X0_EXT_IDX] = box[X0_IDX]
-            box[Y0_EXT_IDX] = boundry_y0
-            box[X1_EXT_IDX] = box[X1_IDX]
-            box[Y1_EXT_IDX] = boundry_y1
-        # else:
-        #     if top_nearest_bbox is None:
-        #         box[Y0_EXT_IDX] = boundry_y0
-        #     else:
-        #         box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
-        #     if bottom_nearest_bbox is None:
-        #         box[Y1_EXT_IDX] = boundry_y1
-        #     else:
-        #         box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
-        #     box[X0_EXT_IDX] = box[X0_IDX]
-        #     box[X1_EXT_IDX] = box[X1_IDX]
-    return bboxes
-    
-
-# ===================================================================================================
-
-def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
-    """
-    增加预处理行为的排序:
-    return:
-    [
-        {
-            "layout_bbox": [x0, y0, x1, y1],
-            "layout_label":"GOOD_LAYOUT/BAD_LAYOUT",
-            "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序
-        }
-    ]
-    """
-    sorted_layouts = [] # 最后的返回结果
-    page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
-    
-    all_bboxes = paper_bbox_sort(all_bboxes) # 大致拍下序
-    # 首先在水平方向上扩展独占一行的bbox
-    for bbox in all_bboxes:
-        left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线
-        right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
-        if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
-            bbox[X0_EXT_IDX] = page_x0
-            bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
-            bbox[X1_EXT_IDX] = page_x1
-            bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
-            
-    # 此时独占一行的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    if len(all_bboxes)==1:
-        return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
-    if len(all_bboxes)==0:
-        return []
-    
-    """
-    然后合并所有连续水平方向的bbox.
-    
-    """
-    all_bboxes.sort(key=lambda x: x[Y0_IDX])
-    h_bboxes = []
-    h_bbox_group = []
-    v_boxes = []
-
-    for bbox in all_bboxes:
-        if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1:
-            h_bbox_group.append(bbox)
-        else:
-            if len(h_bbox_group)>0:
-                h_bboxes.append(h_bbox_group) 
-                h_bbox_group = []
-    # 最后一个group
-    if len(h_bbox_group)>0:
-        h_bboxes.append(h_bbox_group)
-
-    """
-    现在h_bboxes里面是所有的group了，每个group都是一个list
-    对h_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    for gp in h_bboxes:
-        gp.sort(key=lambda x: x[Y0_IDX])
-        block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
-        block_info["layout_bbox"] = [x0, y0, x1, y1]
-        sorted_layouts.append(block_info)
-        
-    # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1，从水平上切分开其余的为几个部分
-    h_split_lines = [page_y0]
-    for gp in h_bboxes:
-        layout_bbox = gp['layout_bbox']
-        y0, y1 = layout_bbox[1], layout_bbox[3]
-        h_split_lines.append(y0)
-        h_split_lines.append(y1)
-    h_split_lines.append(page_y1)
-    
-    unsplited_bboxes = []
-    for i in range(0, len(h_split_lines), 2):
-        start_y0, start_y1 = h_split_lines[i:i+2]
-        # 然后找出[start_y0, start_y1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
-        unsplited_bboxes.append(bboxes_in_block)
-    # ================== 至此，水平方向的 已经切分排序完毕====================================
-    """
-    接下来针对每个非水平的部分切分垂直方向的
-    此时，只剩下了无法被完全水平打通的bbox了。对这些box，优先进行垂直扩展，然后进行垂直切分.
-    分3步：
-    1. 先把能完全垂直打通的隔离出去当做一个layout
-    2. 其余的先垂直切分
-    3. 垂直切分之后的部分再尝试水平切分
-    4. 剩下的不能被切分的各个部分当成一个layout
-    """
-    # 对每部分进行垂直切分
-    for bboxes_in_block in unsplited_bboxes:
-        # 首先对这个block的bbox进行垂直方向上的扩展
-        boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block) 
-        # 进行垂直方向上的扩展
-        extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
-        # 然后对这个block进行垂直方向上的切分
-        extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大，代表了从左到右读取
-        v_boxes_group = []
-        for bbox in extended_vertical_bboxes:
-            if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1:
-                v_boxes_group.append(bbox)
-            else:
-                if len(v_boxes_group)>0:
-                    v_boxes.append(v_boxes_group)
-                    v_boxes_group = []
-                    
-        if len(v_boxes_group)>0:
-            
-            v_boxes.append(v_boxes_group)
-            
-        # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了，因为上面已经做了
-        for gp in v_boxes:
-            gp.sort(key=lambda x: x[X0_IDX])
-            block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
-            # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-            x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
-            block_info["layout_bbox"] = [x0, y0, x1, y1]
-            sorted_layouts.append(block_info)
-            
-        # 在垂直方向上，划分子块，也就是用贯通的垂直线进行切分。这些被切分出来的块，极大可能是可被垂直切分的，如果不能完全的垂直切分，那么尝试水平切分。都不能的则当成一个layout
-        v_split_lines = [boundry_x0]
-        for gp in v_boxes:
-            layout_bbox = gp['layout_bbox']
-            x0, x1 = layout_bbox[0], layout_bbox[2]
-            v_split_lines.append(x0)
-            v_split_lines.append(x1)
-        v_split_lines.append(boundry_x1)
-        
-    reset_idx_x_y(all_bboxes)
-    all_boxes = _paper_bbox_sort_ext(all_bboxes)
-    return all_boxes
-            
-    
-    
-    
-    
-
-
-
--- a/magic_pdf/layout/layout_det_utils.py
+++ b/magic_pdf/layout/layout_det_utils.py
-from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
-from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
-
-
-def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧垂直方向上和this_bbox有重叠的bbox， 不用延长线
-    并且要考虑两个box左右相交的情况，如果相交了，那么右侧的box就不算最左侧。
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] 
-         and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])]
-        
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个——x1最大的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    return left_boxes
-
-def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] 
-        and any([
-        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])]
-    
-    if len(right_bboxes)>0:
-        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
-        right_bboxes = right_bboxes[0]
-    else:
-        right_bboxes = None
-    return right_bboxes
-
-def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(top_bboxes)>0:
-        top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-
-def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(bottom_bboxes)>0:
-        bottom_bboxes.sort(key=lambda x:  x[Y0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-
-# ===================================================================================================================
-def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(bottom_bboxes)>0:
-        # y0最小， X1最大的那个,也就是box上边缘最靠近this_bbox的那个,并且还最靠右
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
-        # 然后再y1相同的情况下，找到x1最大的那个
-        bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-
-def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(bottom_bboxes)>0:
-        # y0最小， X0最小的那个
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
-        # 然后再y0相同的情况下，找到x0最小的那个
-        bottom_bboxes.sort(key=lambda x: x[X0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-
-def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(top_bboxes)>0:
-        # y1最大， X0最小的那个
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
-        # 然后再y1相同的情况下，找到x0最小的那个
-        top_bboxes.sort(key=lambda x: x[X0_IDX])
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-
-def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    
-    if len(top_bboxes)>0:
-        # y1最大， X1最大的那个
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
-        # 然后再y1相同的情况下，找到x1最大的那个
-        top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-    
-# ===================================================================================================================
-
-def get_left_edge_bboxes(all_bboxes) -> list:
-    """
-    返回最左边的bbox
-    """
-    left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None]
-    return left_bboxes
-    
-def get_right_edge_bboxes(all_bboxes) -> list:
-    """
-    返回最右边的bbox
-    """
-    right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None]
-    return right_bboxes
-
-def fix_vertical_bbox_pos(bboxes:list):
-    """
-    检查这批bbox在垂直方向是否有轻微的重叠，如果重叠了，就把重叠的bbox往下移动一点
-    在x方向上必须一个包含或者被包含，或者完全重叠，不能只有部分重叠
-    """
-    bboxes.sort(key=lambda x: x[Y0_IDX]) # 从上向下排列
-    for i in range(0, len(bboxes)):
-        for j in range(i+1, len(bboxes)):
-            if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
-                # 如果两个bbox有部分重叠，那么就把下面的bbox往下移动一点
-                bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2 # 2是个经验值
-                break
-    return bboxes
--- a/magic_pdf/layout/layout_sort.py
+++ b/magic_pdf/layout/layout_sort.py
-"""对pdf上的box进行layout识别，并对内部组成的box进行排序."""
-
-from loguru import logger
-
-from magic_pdf.layout.bbox_sort import (CONTENT_IDX, CONTENT_TYPE_IDX,
-                                        X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX,
-                                        Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX,
-                                        paper_bbox_sort)
-from magic_pdf.layout.layout_det_utils import (
-    find_all_bottom_bbox_direct, find_all_left_bbox_direct,
-    find_all_right_bbox_direct, find_all_top_bbox_direct,
-    find_bottom_bbox_direct_from_left_edge,
-    find_bottom_bbox_direct_from_right_edge,
-    find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge,
-    get_left_edge_bboxes, get_right_edge_bboxes)
-from magic_pdf.libs.boxbase import get_bbox_in_boundary
-
-LAYOUT_V = 'V'
-LAYOUT_H = 'H'
-LAYOUT_UNPROC = 'U'
-LAYOUT_BAD = 'B'
-
-
-def _is_single_line_text(bbox):
-    """检查bbox里面的文字是否只有一行."""
-    return True  # TODO
-    box_type = bbox[CONTENT_TYPE_IDX]
-    if box_type != 'text':
-        return False
-    paras = bbox[CONTENT_IDX]['paras']
-    text_content = ''
-    for para_id, para in paras.items():  # 拼装内部的段落文本
-        is_title = para['is_title']
-        if is_title != 0:
-            text_content += f"## {para['text']}"
-        else:
-            text_content += para['text']
-        text_content += '\n\n'
-
-    return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split('\n\n')) <= 1
-
-
-def _horizontal_split(bboxes: list, boundary: tuple, avg_font_size=20) -> list:
-    """
-    对bboxes进行水平切割
-    方法是：找到左侧和右侧都没有被直接遮挡的box，然后进行扩展，之后进行切割
-    return:
-        返回几个大的Layout区域 [[x0, y0, x1, y1, "h|u|v"], ], h代表水平，u代表未探测的，v代表垂直布局
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-
-    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
-    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
-    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
-    """
-    首先在水平方向上扩展独占一行的bbox
-
-    """
-    last_h_split_line_y1 = bound_y0  # 记录下上次的水平分割线
-    for i, bbox in enumerate(all_bboxes):
-        left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes)  # 非扩展线
-        right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes)
-        if left_nearest_bbox is None and right_nearest_bbox is None:  # 独占一行
-            """
-            然而，如果只是孤立的一行文字，那么就还要满足以下几个条件才可以：
-            1. bbox和中心线相交。或者
-            2. 上方或者下方也存在同类水平的独占一行的bbox。 或者
-            3. TODO 加强条件：这个bbox上方和下方是同一列column，那么就不能算作独占一行
-            """
-            # 先检查这个bbox里是否只包含一行文字
-            # is_single_line = _is_single_line_text(bbox)
-            """
-            这里有个点需要注意，当页面内容不是居中的时候，第一次调用传递的是page的boundary，这个时候mid_x就不是中心线了.
-            所以这里计算出最紧致的boundary，然后再计算mid_x
-            """
-            boundary_real_x0, boundary_real_x1 = min(
-                [bbox[X0_IDX] for bbox in all_bboxes]
-            ), max([bbox[X1_IDX] for bbox in all_bboxes])
-            mid_x = (boundary_real_x0 + boundary_real_x1) / 2
-            # 检查这个box是否内容在中心线有交
-            # 必须跨过去2个字符的宽度
-            is_cross_boundary_mid_line = (
-                min(mid_x - bbox[X0_IDX], bbox[X1_IDX] - mid_x) > avg_font_size * 2
-            )
-            """
-            检查条件2
-            """
-            is_belong_to_col = False
-            """
-            检查是否能被上方col吸收，方法是：
-            1. 上方非空且不是独占一行的，并且
-            2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1]
-            """
-            """
-            以迭代的方式向上找，查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
-            """
-            # 先确定上方的y0, y0
-            b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX]
-            # 然后从box开始逐个向上找到所有与box在x上有交集的box
-            box_to_check = [bound_x0, b_y0, bound_x1, b_y1]
-            bbox_in_bound_check = get_bbox_in_boundary(all_bboxes, box_to_check)
-
-            bboxes_on_top = []
-            virtual_box = bbox
-            while True:
-                b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check)
-                if b_on_top is not None:
-                    bboxes_on_top.append(b_on_top)
-                    virtual_box = [
-                        min([virtual_box[X0_IDX], b_on_top[X0_IDX]]),
-                        min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]),
-                        max([virtual_box[X1_IDX], b_on_top[X1_IDX]]),
-                        b_y1,
-                    ]
-                else:
-                    break
-
-            # 随后确定这些box的最小x0, 最大x1
-            if len(bboxes_on_top) > 0 and len(bboxes_on_top) != len(
-                bbox_in_bound_check
-            ):  # virtual_box可能会膨胀到占满整个区域，这实际上就不能属于一个col了。
-                min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
-                # 然后采用一种比较粗糙的方法，看min_x0，max_x1是否与位于[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]之间的box有相交
-
-                if not any(
-                    [
-                        b[X0_IDX] <= min_x0 - 1 <= b[X1_IDX]
-                        or b[X0_IDX] <= max_x1 + 1 <= b[X1_IDX]
-                        for b in bbox_in_bound_check
-                    ]
-                ):
-                    # 其上，下都不能被扩展成行，暂时只检查一下上方 TODO
-                    top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
-                    bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
-                    if not any(
-                        [
-                            top_nearest_bbox is not None
-                            and (
-                                find_all_left_bbox_direct(top_nearest_bbox, bboxes)
-                                is None
-                                and find_all_right_bbox_direct(top_nearest_bbox, bboxes)
-                                is None
-                            ),
-                            bottom_nearest_bbox is not None
-                            and (
-                                find_all_left_bbox_direct(bottom_nearest_bbox, bboxes)
-                                is None
-                                and find_all_right_bbox_direct(
-                                    bottom_nearest_bbox, bboxes
-                                )
-                                is None
-                            ),
-                            top_nearest_bbox is None or bottom_nearest_bbox is None,
-                        ]
-                    ):
-                        is_belong_to_col = True
-
-            # 检查是否能被下方col吸收 TODO
-            """
-            这里为什么没有is_cross_boundary_mid_line的条件呢？
-            确实有些杂志左右两栏宽度不是对称的。
-            """
-            if not is_belong_to_col or is_cross_boundary_mid_line:
-                bbox[X0_EXT_IDX] = bound_x0
-                bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
-                bbox[X1_EXT_IDX] = bound_x1
-                bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
-                last_h_split_line_y1 = bbox[Y1_IDX]  # 更新这条线
-            else:
-                continue
-    """
-    此时独占一行的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    然后合并所有连续水平方向的bbox.
-    """
-    all_bboxes.sort(key=lambda x: x[Y0_IDX])
-    h_bboxes = []
-    h_bbox_group = []
-
-    for bbox in all_bboxes:
-        if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
-            h_bbox_group.append(bbox)
-        else:
-            if len(h_bbox_group) > 0:
-                h_bboxes.append(h_bbox_group)
-                h_bbox_group = []
-    # 最后一个group
-    if len(h_bbox_group) > 0:
-        h_bboxes.append(h_bbox_group)
-    """
-    现在h_bboxes里面是所有的group了，每个group都是一个list
-    对h_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    h_layouts = []
-    for gp in h_bboxes:
-        gp.sort(key=lambda x: x[Y0_IDX])
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = (
-            gp[0][X0_EXT_IDX],
-            gp[0][Y0_EXT_IDX],
-            gp[-1][X1_EXT_IDX],
-            gp[-1][Y1_EXT_IDX],
-        )
-        h_layouts.append([x0, y0, x1, y1, LAYOUT_H])  # 水平的布局
-    """
-    接下来利用这些连续的水平bbox的layout_bbox的y0, y1，从水平上切分开其余的为几个部分
-    """
-    h_split_lines = [bound_y0]
-    for gp in h_bboxes:  # gp是一个list[bbox_list]
-        y0, y1 = gp[0][1], gp[-1][3]
-        h_split_lines.append(y0)
-        h_split_lines.append(y1)
-    h_split_lines.append(bound_y1)
-
-    unsplited_bboxes = []
-    for i in range(0, len(h_split_lines), 2):
-        start_y0, start_y1 = h_split_lines[i : i + 2]
-        # 然后找出[start_y0, start_y1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [
-            bbox
-            for bbox in all_bboxes
-            if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1
-        ]
-        unsplited_bboxes.append(bboxes_in_block)
-    # 接着把未处理的加入到h_layouts里
-    for bboxes_in_block in unsplited_bboxes:
-        if len(bboxes_in_block) == 0:
-            continue
-        x0, y0, x1, y1 = (
-            bound_x0,
-            min([bbox[Y0_IDX] for bbox in bboxes_in_block]),
-            bound_x1,
-            max([bbox[Y1_IDX] for bbox in bboxes_in_block]),
-        )
-        h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])
-
-    h_layouts.sort(key=lambda x: x[1])  # 按照y0排序, 也就是从上到下的顺序
-    """
-    转换成如下格式返回
-    """
-    for layout in h_layouts:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': layout[:4],
-                'layout_label': layout[4],
-                'sub_layout': [],
-            }
-        )
-    return sorted_layout_blocks
-
-
-###############################################################################################
-#
-#  垂直方向的处理
-#
-#
-###############################################################################################
-def _vertical_align_split_v1(bboxes: list, boundary: tuple) -> list:
-    """
-    计算垂直方向上的对齐， 并分割bboxes成layout。负责对一列多行的进行列维度分割。
-    如果不能完全分割，剩余部分作为layout_lable为u的layout返回
-    -----------------------
-    |     |           |
-    |     |           |
-    |     |           |
-    |     |           |
-    -------------------------
-    此函数会将：以上布局将会切分出来2列
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
-
-    v_blocks = []
-    """
-    先从左到右切分
-    """
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
-        if len(left_edge_bboxes) == 0:
-            break
-        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes]) + 1
-        # 然后检查这条线能不与其他bbox的左边界相交或者重合
-        if any(
-            [bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]
-        ):
-            # 垂直切分线与某些box发生相交，说明无法完全垂直方向切分。
-            break
-        else:  # 说明成功分割出一列
-            # 找到左侧边界最靠左的bbox作为layout的x0
-            layout_x0 = min(
-                [bbox[X0_IDX] for bbox in left_edge_bboxes]
-            )  # 这里主要是为了画出来有一定间距
-            v_blocks.append(
-                [
-                    layout_x0,
-                    new_boundary[1],
-                    right_split_line_x1,
-                    new_boundary[3],
-                    LAYOUT_V,
-                ]
-            )
-            new_boundary[0] = right_split_line_x1  # 更新边界
-    """
-    再从右到左切， 此时如果还是无法完全切分，那么剩余部分作为layout_lable为u的layout返回
-    """
-    unsplited_block = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
-        if len(right_edge_bboxes) == 0:
-            break
-        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes]) - 1
-        # 然后检查这条线能不与其他bbox的左边界相交或者重合
-        if any(
-            [bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]
-        ):
-            # 这里是余下的
-            unsplited_block.append(
-                [
-                    new_boundary[0],
-                    new_boundary[1],
-                    new_boundary[2],
-                    new_boundary[3],
-                    LAYOUT_UNPROC,
-                ]
-            )
-            break
-        else:
-            # 找到右侧边界最靠右的bbox作为layout的x1
-            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
-            v_blocks.append(
-                [
-                    left_split_line_x0,
-                    new_boundary[1],
-                    layout_x1,
-                    new_boundary[3],
-                    LAYOUT_V,
-                ]
-            )
-            new_boundary[2] = left_split_line_x0  # 更新右边界
-    """
-    最后拼装成layout格式返回
-    """
-    for block in v_blocks:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-    for block in unsplited_block:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-
-    # 按照x0排序
-    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
-    return sorted_layout_blocks
-
-
-def _vertical_align_split_v2(bboxes: list, boundary: tuple) -> list:
-    """改进的
-    _vertical_align_split算法，原算法会因为第二列的box由于左侧没有遮挡被认为是左侧的一部分，导致整个layout多列被识别为一列。
-    利用从左上角的box开始向下看的方法，不断扩展w_x0, w_x1，直到不能继续向下扩展，或者到达边界下边界。"""
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
-    bad_boxes = []  # 被割中的box
-    v_blocks = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        if len(all_bboxes) == 0:
-            break
-        left_top_box = min(
-            all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX])
-        )  # 这里应该加强，检查一下必须是在第一列的 TODO
-        start_box = [
-            left_top_box[X0_IDX],
-            left_top_box[Y0_IDX],
-            left_top_box[X1_IDX],
-            left_top_box[Y1_IDX],
-        ]
-        w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
-        """
-        然后沿着这个box线向下找最近的那个box, 然后扩展w_x0, w_x1
-        扩展之后，宽度会增加，随后用x=w_x1来检测在边界内是否有box与相交，如果相交，那么就说明不能再扩展了。
-        当不能扩展的时候就要看是否到达下边界：
-        1. 达到，那么更新左边界继续分下一个列
-        2. 没有达到，那么此时开始从右侧切分进入下面的循环里
-        """
-        while left_top_box is not None:  # 向下去找
-            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
-            left_top_box = find_bottom_bbox_direct_from_left_edge(
-                virtual_box, all_bboxes
-            )
-            if left_top_box:
-                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
-                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
-                )
-        # 万一这个初始的box在column中间，那么还要向上看
-        start_box = [
-            w_x0,
-            start_box[Y0_IDX],
-            w_x1,
-            start_box[Y1_IDX],
-        ]  # 扩展一下宽度更鲁棒
-        left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
-        while left_top_box is not None:  # 向上去找
-            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
-            left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
-            if left_top_box:
-                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
-                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
-                )
-
-        # 检查相交
-        if any([bbox[X0_IDX] <= w_x1 + 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
-            for b in all_bboxes:
-                if b[X0_IDX] <= w_x1 + 1 <= b[X1_IDX]:
-                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
-            break
-        else:  # 说明成功分割出一列
-            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
-            new_boundary[0] = w_x1  # 更新边界
-    """
-    接着开始从右上角的box扫描
-    """
-    w_x0, w_x1 = 0, 0
-    unsplited_block = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        if len(all_bboxes) == 0:
-            break
-        # 先找到X1最大的
-        bbox_list_sorted = sorted(
-            all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True
-        )
-        # Then, find the boxes with the smallest Y0 value
-        bigest_x1 = bbox_list_sorted[0][X1_IDX]
-        boxes_with_bigest_x1 = [
-            bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1
-        ]  # 也就是最靠右的那些
-        right_top_box = min(
-            boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]
-        )  # y0最小的那个
-        start_box = [
-            right_top_box[X0_IDX],
-            right_top_box[Y0_IDX],
-            right_top_box[X1_IDX],
-            right_top_box[Y1_IDX],
-        ]
-        w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]
-
-        while right_top_box is not None:
-            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
-            right_top_box = find_bottom_bbox_direct_from_right_edge(
-                virtual_box, all_bboxes
-            )
-            if right_top_box:
-                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
-                    [w_x1, right_top_box[X1_IDX]]
-                )
-        # 在向上扫描
-        start_box = [
-            w_x0,
-            start_box[Y0_IDX],
-            w_x1,
-            start_box[Y1_IDX],
-        ]  # 扩展一下宽度更鲁棒
-        right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
-        while right_top_box is not None:
-            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
-            right_top_box = find_top_bbox_direct_from_right_edge(
-                virtual_box, all_bboxes
-            )
-            if right_top_box:
-                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
-                    [w_x1, right_top_box[X1_IDX]]
-                )
-
-        # 检查是否与其他box相交， 垂直切分线与某些box发生相交，说明无法完全垂直方向切分。
-        if any([bbox[X0_IDX] <= w_x0 - 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
-            unsplited_block.append(
-                [
-                    new_boundary[0],
-                    new_boundary[1],
-                    new_boundary[2],
-                    new_boundary[3],
-                    LAYOUT_UNPROC,
-                ]
-            )
-            for b in all_bboxes:
-                if b[X0_IDX] <= w_x0 - 1 <= b[X1_IDX]:
-                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
-            break
-        else:  # 说明成功分割出一列
-            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
-            new_boundary[2] = w_x0
-    """转换数据结构"""
-    for block in v_blocks:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-
-    for block in unsplited_block:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-                'bad_boxes': bad_boxes,  # 记录下来，这个box是被割中的
-            }
-        )
-
-    # 按照x0排序
-    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
-    return sorted_layout_blocks
-
-
-def _try_horizontal_mult_column_split(bboxes: list, boundary: tuple) -> list:
-    """
-    尝试水平切分，如果切分不动，那就当一个BAD_LAYOUT返回
-    ------------------
-    |        |       |
-    ------------------
-    |    |       |   |   <-  这里是此函数要切分的场景
-    ------------------
-    |        |       |
-    |        |       |
-    """
-    pass
-
-
-def _vertical_split(bboxes: list, boundary: tuple) -> list:
-    """
-    从垂直方向进行切割，分block
-    这个版本里，如果垂直切分不动，那就当一个BAD_LAYOUT返回
-
-                                --------------------------
-                                    |        |       |
-                                    |        |       |
-                                | |
-    这种列是此函数要切分的  ->    | |
-                                | |
-                                    |        |       |
-                                    |        |       |
-                                -------------------------
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-
-    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
-    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
-    """
-    all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖
-    all_bboxes = fix_hor_bbox_pos(all_bboxes)  # 水平解覆盖
-
-    这两行代码目前先不执行，因为公式检测，表格检测还不是很成熟，导致非常多的textblock参与了运算，时间消耗太大。
-    这两行代码的作用是：
-    如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩，从而避免重叠。对布局切分来说带来正反馈。
-    """
-
-    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
-    """
-    首先在垂直方向上扩展独占一行的bbox
-
-    """
-    for bbox in all_bboxes:
-        top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes)  # 非扩展线
-        bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
-        if (
-            top_nearest_bbox is None
-            and bottom_nearest_bbox is None
-            and not any(
-                [
-                    b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX]
-                    or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX]
-                    for b in all_bboxes
-                ]
-            )
-        ):  # 独占一列, 且不和其他重叠
-            bbox[X0_EXT_IDX] = bbox[X0_IDX]
-            bbox[Y0_EXT_IDX] = bound_y0
-            bbox[X1_EXT_IDX] = bbox[X1_IDX]
-            bbox[Y1_EXT_IDX] = bound_y1
-        """
-    此时独占一列的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    然后合并所有连续垂直方向的bbox.
-    """
-    all_bboxes.sort(key=lambda x: x[X0_IDX])
-    # fix: 这里水平方向的列不要合并成一个行，因为需要保证返回给下游的最小block，总是可以无脑从上到下阅读文字。
-    v_bboxes = []
-    for box in all_bboxes:
-        if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1:
-            v_bboxes.append(box)
-    """
-    现在v_bboxes里面是所有的group了，每个group都是一个list
-    对v_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    v_layouts = []
-    for vbox in v_bboxes:
-        # gp.sort(key=lambda x: x[X0_IDX])
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = (
-            vbox[X0_EXT_IDX],
-            vbox[Y0_EXT_IDX],
-            vbox[X1_EXT_IDX],
-            vbox[Y1_EXT_IDX],
-        )
-        v_layouts.append([x0, y0, x1, y1, LAYOUT_V])  # 垂直的布局
-    """
-    接下来利用这些连续的垂直bbox的layout_bbox的x0, x1，从垂直上切分开其余的为几个部分
-    """
-    v_split_lines = [bound_x0]
-    for gp in v_bboxes:
-        x0, x1 = gp[X0_IDX], gp[X1_IDX]
-        v_split_lines.append(x0)
-        v_split_lines.append(x1)
-    v_split_lines.append(bound_x1)
-
-    unsplited_bboxes = []
-    for i in range(0, len(v_split_lines), 2):
-        start_x0, start_x1 = v_split_lines[i : i + 2]
-        # 然后找出[start_x0, start_x1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [
-            bbox
-            for bbox in all_bboxes
-            if bbox[X0_IDX] >= start_x0 and bbox[X1_IDX] <= start_x1
-        ]
-        unsplited_bboxes.append(bboxes_in_block)
-    # 接着把未处理的加入到v_layouts里
-    for bboxes_in_block in unsplited_bboxes:
-        if len(bboxes_in_block) == 0:
-            continue
-        x0, y0, x1, y1 = (
-            min([bbox[X0_IDX] for bbox in bboxes_in_block]),
-            bound_y0,
-            max([bbox[X1_IDX] for bbox in bboxes_in_block]),
-            bound_y1,
-        )
-        v_layouts.append(
-            [x0, y0, x1, y1, LAYOUT_UNPROC]
-        )  # 说明这篇区域未能够分析出可靠的版面
-
-    v_layouts.sort(key=lambda x: x[0])  # 按照x0排序, 也就是从左到右的顺序
-
-    for layout in v_layouts:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': layout[:4],
-                'layout_label': layout[4],
-                'sub_layout': [],
-            }
-        )
-    """
-    至此，垂直方向切成了2种类型，其一是独占一列的，其二是未处理的。
-    下面对这些未处理的进行垂直方向切分，这个切分要切出来类似“吕”这种类型的垂直方向的布局
-    """
-    for i, layout in enumerate(sorted_layout_blocks):
-        if layout['layout_label'] == LAYOUT_UNPROC:
-            x0, y0, x1, y1 = layout['layout_bbox']
-            v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1])
-            sorted_layout_blocks[i] = {
-                'layout_bbox': [x0, y0, x1, y1],
-                'layout_label': LAYOUT_H,
-                'sub_layout': v_split_layouts,
-            }
-            layout['layout_label'] = LAYOUT_H  # 被垂线切分成了水平布局
-
-    return sorted_layout_blocks
-
-
-def split_layout(bboxes: list, boundary: tuple, page_num: int) -> list:
-    """
-    把bboxes切割成layout
-    return:
-    [
-        {
-            "layout_bbox": [x0,y0,x1,y1],
-            "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
-            "sub_layout":[] #每个元素都是[
-                                            x0,y0,
-                                            x1,y1,
-                                            block_content,
-                                            idx_x,idx_y,
-                                            content_type,
-                                            ext_x0,ext_y0,
-                                            ext_x1,ext_y1
-                                        ], 并且顺序就是阅读顺序
-        }
-    ]
-    example:
-    [
-        {
-            "layout_bbox": [0, 0, 100, 100],
-            "layout_label":"u|v|h|b",
-            "sub_layout":[
-
-            ]
-        },
-        {
-            "layout_bbox": [0, 0, 100, 100],
-            "layout_label":"u|v|h|b",
-            "sub_layout":[
-                {
-                    "layout_bbox": [0, 0, 100, 100],
-                    "layout_label":"u|v|h|b",
-                    "content_bboxes":[
-                        [],
-                        [],
-                        []
-                    ]
-                },
-                {
-                    "layout_bbox": [0, 0, 100, 100],
-                    "layout_label":"u|v|h|b",
-                    "sub_layout":[
-
-                    ]
-                }
-        }
-    ]
-    """
-    sorted_layouts = []  # 最终返回的结果
-
-    boundary_x0, boundary_y0, boundary_x1, boundary_y1 = boundary
-    if len(bboxes) <= 1:
-        return [
-            {
-                'layout_bbox': [boundary_x0, boundary_y0, boundary_x1, boundary_y1],
-                'layout_label': LAYOUT_V,
-                'sub_layout': [],
-            }
-        ]
-    """
-    接下来按照先水平后垂直的顺序进行切分
-    """
-    bboxes = paper_bbox_sort(
-        bboxes, boundary_x1 - boundary_x0, boundary_y1 - boundary_y0
-    )
-    sorted_layouts = _horizontal_split(bboxes, boundary)  # 通过水平分割出来的layout
-    for i, layout in enumerate(sorted_layouts):
-        x0, y0, x1, y1 = layout['layout_bbox']
-        layout_type = layout['layout_label']
-        if layout_type == LAYOUT_UNPROC:  # 说明是非独占单行的，这些需要垂直切分
-            v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])
-            """
-            最后这里有个逻辑问题：如果这个函数只分离出来了一个column layout，那么这个layout分割肯定超出了算法能力范围。因为我们假定的是传进来的
-            box已经把行全部剥离了，所以这里必须十多个列才可以。如果只剥离出来一个layout，并且是多个box，那么就说明这个layout是无法分割的，标记为LAYOUT_UNPROC
-            """
-            layout_label = LAYOUT_V
-            if len(v_split_layouts) == 1:
-                if len(v_split_layouts[0]['sub_layout']) == 0:
-                    layout_label = LAYOUT_UNPROC
-                    # logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)
-            """
-            组合起来最终的layout
-            """
-            sorted_layouts[i] = {
-                'layout_bbox': [x0, y0, x1, y1],
-                'layout_label': layout_label,
-                'sub_layout': v_split_layouts,
-            }
-            layout['layout_label'] = LAYOUT_H
-    """
-    水平和垂直方向都切分完毕了。此时还有一些未处理的，这些未处理的可能是因为水平和垂直方向都无法切分。
-    这些最后调用_try_horizontal_mult_block_split做一次水平多个block的联合切分，如果也不能切分最终就当做BAD_LAYOUT返回
-    """
-    # TODO
-
-    return sorted_layouts
-
-
-def get_bboxes_layout(all_boxes: list, boundary: tuple, page_id: int):
-    """
-    对利用layout排序之后的box，进行排序
-    return:
-    [
-        {
-            "layout_bbox": [x0, y0, x1, y1],
-            "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
-        }，
-    ]
-    """
-
-    def _preorder_traversal(layout):
-        """对sorted_layouts的叶子节点，也就是len(sub_layout)==0的节点进行排序。排序按照前序遍历的顺序，也就是从上到
-        下，从左到右的顺序."""
-        sorted_layout_blocks = []
-        for layout in layout:
-            sub_layout = layout['sub_layout']
-            if len(sub_layout) == 0:
-                sorted_layout_blocks.append(layout)
-            else:
-                s = _preorder_traversal(sub_layout)
-                sorted_layout_blocks.extend(s)
-        return sorted_layout_blocks
-
-    # -------------------------------------------------------------------------------------------------------------------------
-    sorted_layouts = split_layout(
-        all_boxes, boundary, page_id
-    )  # 先切分成layout，得到一个Tree
-    total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
-    return total_sorted_layout_blocks, sorted_layouts
-
-
-def get_columns_cnt_of_layout(layout_tree):
-    """获取一个layout的宽度."""
-    max_width_list = [0]  # 初始化一个元素，防止max,min函数报错
-
-    for items in layout_tree:  # 针对每一层（横切）计算列数，横着的算一列
-        layout_type = items['layout_label']
-        sub_layouts = items['sub_layout']
-        if len(sub_layouts) == 0:
-            max_width_list.append(1)
-        else:
-            if layout_type == LAYOUT_H:
-                max_width_list.append(1)
-            else:
-                width = 0
-                for sub_layout in sub_layouts:
-                    if len(sub_layout['sub_layout']) == 0:
-                        width += 1
-                    else:
-                        for lay in sub_layout['sub_layout']:
-                            width += get_columns_cnt_of_layout([lay])
-                max_width_list.append(width)
-
-    return max(max_width_list)
-
-
-def sort_with_layout(bboxes: list, page_width, page_height) -> (list, list):
-    """输入是一个bbox的list.
-
-    获取到输入之后，先进行layout切分，然后对这些bbox进行排序。返回排序后的bboxes
-    """
-
-    new_bboxes = []
-    for box in bboxes:
-        # new_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
-        new_bboxes.append(
-            [
-                box[0],
-                box[1],
-                box[2],
-                box[3],
-                None,
-                None,
-                None,
-                'text',
-                None,
-                None,
-                None,
-                None,
-                box[4],
-            ]
-        )
-
-    layout_bboxes, _ = get_bboxes_layout(
-        new_bboxes, tuple([0, 0, page_width, page_height]), 0
-    )
-    if any([lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]):
-        logger.warning('drop this pdf, reason: 复杂版面')
-        return None, None
-
-    sorted_bboxes = []
-    # 利用layout bbox每次框定一些box，然后排序
-    for layout in layout_bboxes:
-        lbox = layout['layout_bbox']
-        bbox_in_layout = get_bbox_in_boundary(new_bboxes, lbox)
-        sorted_bbox = paper_bbox_sort(
-            bbox_in_layout, lbox[2] - lbox[0], lbox[3] - lbox[1]
-        )
-        sorted_bboxes.extend(sorted_bbox)
-
-    return sorted_bboxes, layout_bboxes
-
-
-def sort_text_block(text_block, layout_bboxes):
-    """对一页的text_block进行排序."""
-    sorted_text_bbox = []
-    all_text_bbox = []
-    # 做一个box=>text的映射
-    box_to_text = {}
-    for blk in text_block:
-        box = blk['bbox']
-        box_to_text[(box[0], box[1], box[2], box[3])] = blk
-        all_text_bbox.append(box)
-
-    # text_blocks_to_sort = []
-    # for box in box_to_text.keys():
-    #     text_blocks_to_sort.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
-
-    # 按照layout_bboxes的顺序，对text_block进行排序
-    for layout in layout_bboxes:
-        layout_box = layout['layout_bbox']
-        text_bbox_in_layout = get_bbox_in_boundary(
-            all_text_bbox,
-            [
-                layout_box[0] - 1,
-                layout_box[1] - 1,
-                layout_box[2] + 1,
-                layout_box[3] + 1,
-            ],
-        )
-        # sorted_bbox = paper_bbox_sort(text_bbox_in_layout, layout_box[2]-layout_box[0], layout_box[3]-layout_box[1])
-        text_bbox_in_layout.sort(
-            key=lambda x: x[1]
-        )  # 一个layout内部的box，按照y0自上而下排序
-        # sorted_bbox = [[b] for b in text_blocks_to_sort]
-        for sb in text_bbox_in_layout:
-            sorted_text_bbox.append(box_to_text[(sb[0], sb[1], sb[2], sb[3])])
-
-    return sorted_text_bbox
--- a/magic_pdf/layout/layout_spiler_recog.py
+++ b/magic_pdf/layout/layout_spiler_recog.py
-"""
-找到能分割布局的水平的横线、色块
-"""
-
-import os
-from magic_pdf.libs.commons import fitz
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap
-
-
-def __rect_filter_by_width(rect, page_w, page_h):
-    mid_x = page_w/2
-    if rect[0]< mid_x < rect[2]:
-        return True
-    return False
-
-
-def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
-    """
-    不能出现在table和image的位置
-    """
-    for box in image_bboxes:
-        if _is_in_or_part_overlap(rect, box):
-            return False
-    
-    for box in table_bboxes:
-        if _is_in_or_part_overlap(rect, box):
-            return False
-    
-    return True
-
-
-def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
-    save_path = "./tmp/debug.pdf"
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-
-    width = page.rect.width
-    height = page.rect.height
-    new_page = doc.new_page(width=width, height=height)
-    
-    shape = new_page.new_shape()
-    for bbox in bboxes1:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in bboxes2:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-        
-    for bbox in bboxes3:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-        
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-
-    doc.save(save_path)
-    doc.close() 
-    
-def get_spilter_of_page(page, image_bboxes, table_bboxes):
-    """
-    获取到色块和横线
-    """
-    cdrawings = page.get_cdrawings()
-    
-    spilter_bbox = []
-    for block in cdrawings:
-        if 'fill' in block:
-            fill = block['fill']
-        if 'fill' in block and block['fill'] and block['fill']!=(1.0,1.0,1.0):
-            rect = block['rect']
-            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
-                spilter_bbox.append(list(rect))
-    
-    """过滤、修正一下这些box。因为有时候会有一些矩形，高度为0或者为负数，造成layout计算无限循环。如果是负高度或者0高度，统一修正为高度为1"""
-    for box in spilter_bbox:
-        if box[3]-box[1] <= 0:
-            box[3] = box[1] + 1
-            
-    #__debug_show_page(page, spilter_bbox, [], [])
-    
-    return spilter_bbox
--- a/magic_pdf/layout/mcol_sort.py
+++ b/magic_pdf/layout/mcol_sort.py
-"""
-This is an advanced PyMuPDF utility for detecting multi-column pages.
-It can be used in a shell script, or its main function can be imported and
-invoked as descript below.
-
-Features
---------
- Identify text belonging to (a variable number of) columns on the page.
- Text with different background color is handled separately, allowing for
-  easier treatment of side remarks, comment boxes, etc.
- Uses text block detection capability to identify text blocks and
-  uses the block bboxes as primary structuring principle.
- Supports ignoring footers via a footer margin parameter.
- Returns re-created text boundary boxes (integer coordinates), sorted ascending
-  by the top, then by the left coordinates.
-
-Restrictions
-------------
- Only supporting horizontal, left-to-right text
- Returns a list of text boundary boxes - not the text itself. The caller is
-  expected to extract text from within the returned boxes.
- Text written above images is ignored altogether (option).
- This utility works as expected in most cases. The following situation cannot
-  be handled correctly:
-    * overlapping (non-disjoint) text blocks
-    * image captions are not recognized and are handled like normal text
-
-Usage
------
- As a CLI shell command use
-
-  python multi_column.py input.pdf footer_margin
-
-  Where footer margin is the height of the bottom stripe to ignore on each page.
-  This code is intended to be modified according to your need.
-
- Use in a Python script as follows:
-
-  ----------------------------------------------------------------------------------
-  from multi_column import column_boxes
-
-  # for each page execute
-  bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
-
-  # bboxes is a list of fitz.IRect objects, that are sort ascending by their y0,
-  # then x0 coordinates. Their text content can be extracted by all PyMuPDF
-  # get_text() variants, like for instance the following:
-  for rect in bboxes:
-      print(page.get_text(clip=rect, sort=True))
-  ----------------------------------------------------------------------------------
-"""
-import sys
-from magic_pdf.libs.commons import fitz
-
-
-def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
-    """Determine bboxes which wrap a column."""
-    paths = page.get_drawings()
-    bboxes = []
-
-    # path rectangles
-    path_rects = []
-
-    # image bboxes
-    img_bboxes = []
-
-    # bboxes of non-horizontal text
-    # avoid when expanding horizontal text boxes
-    vert_bboxes = []
-
-    # compute relevant page area
-    clip = +page.rect
-    clip.y1 -= footer_margin  # Remove footer area
-    clip.y0 += header_margin  # Remove header area
-
-    def can_extend(temp, bb, bboxlist):
-        """Determines whether rectangle 'temp' can be extended by 'bb'
-        without intersecting any of the rectangles contained in 'bboxlist'.
-
-        Items of bboxlist may be None if they have been removed.
-
-        Returns:
-            True if 'temp' has no intersections with items of 'bboxlist'.
-        """
-        for b in bboxlist:
-            if not intersects_bboxes(temp, vert_bboxes) and (
-                b == None or b == bb or (temp & b).is_empty
-            ):
-                continue
-            return False
-
-        return True
-
-    def in_bbox(bb, bboxes):
-        """Return 1-based number if a bbox contains bb, else return 0."""
-        for i, bbox in enumerate(bboxes):
-            if bb in bbox:
-                return i + 1
-        return 0
-
-    def intersects_bboxes(bb, bboxes):
-        """Return True if a bbox intersects bb, else return False."""
-        for bbox in bboxes:
-            if not (bb & bbox).is_empty:
-                return True
-        return False
-
-    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
-        """Extend a bbox to the right page border.
-
-        Whenever there is no text to the right of a bbox, enlarge it up
-        to the right page border.
-
-        Args:
-            bboxes: (list[IRect]) bboxes to check
-            width: (int) page width
-            path_bboxes: (list[IRect]) bboxes with a background color
-            vert_bboxes: (list[IRect]) bboxes with vertical text
-            img_bboxes: (list[IRect]) bboxes of images
-        Returns:
-            Potentially modified bboxes.
-        """
-        for i, bb in enumerate(bboxes):
-            # do not extend text with background color
-            if in_bbox(bb, path_bboxes):
-                continue
-
-            # do not extend text in images
-            if in_bbox(bb, img_bboxes):
-                continue
-
-            # temp extends bb to the right page border
-            temp = +bb
-            temp.x1 = width
-
-            # do not cut through colored background or images
-            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
-                continue
-
-            # also, do not intersect other text bboxes
-            check = can_extend(temp, bb, bboxes)
-            if check:
-                bboxes[i] = temp  # replace with enlarged bbox
-
-        return [b for b in bboxes if b != None]
-
-    def clean_nblocks(nblocks):
-        """Do some elementary cleaning."""
-
-        # 1. remove any duplicate blocks.
-        blen = len(nblocks)
-        if blen < 2:
-            return nblocks
-        start = blen - 1
-        for i in range(start, -1, -1):
-            bb1 = nblocks[i]
-            bb0 = nblocks[i - 1]
-            if bb0 == bb1:
-                del nblocks[i]
-
-        # 2. repair sequence in special cases:
-        # consecutive bboxes with almost same bottom value are sorted ascending
-        # by x-coordinate.
-        y1 = nblocks[0].y1  # first bottom coordinate
-        i0 = 0  # its index
-        i1 = -1  # index of last bbox with same bottom
-
-        # Iterate over bboxes, identifying segments with approx. same bottom value.
-        # Replace every segment by its sorted version.
-        for i in range(1, len(nblocks)):
-            b1 = nblocks[i]
-            if abs(b1.y1 - y1) > 10:  # different bottom
-                if i1 > i0:  # segment length > 1? Sort it!
-                    nblocks[i0 : i1 + 1] = sorted(
-                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
-                    )
-                y1 = b1.y1  # store new bottom value
-                i0 = i  # store its start index
-            i1 = i  # store current index
-        if i1 > i0:  # segment waiting to be sorted
-            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
-        return nblocks
-
-    # extract vector graphics
-    for p in paths:
-        path_rects.append(p["rect"].irect)
-    path_bboxes = path_rects
-
-    # sort path bboxes by ascending top, then left coordinates
-    path_bboxes.sort(key=lambda b: (b.y0, b.x0))
-
-    # bboxes of images on page, no need to sort them
-    for item in page.get_images():
-        img_bboxes.extend(page.get_image_rects(item[0]))
-
-    # blocks of text on page
-    blocks = page.get_text(
-        "dict",
-        flags=fitz.TEXTFLAGS_TEXT,
-        clip=clip,
-    )["blocks"]
-
-    # Make block rectangles, ignoring non-horizontal text
-    for b in blocks:
-        bbox = fitz.IRect(b["bbox"])  # bbox of the block
-
-        # ignore text written upon images
-        if no_image_text and in_bbox(bbox, img_bboxes):
-            continue
-
-        # confirm first line to be horizontal
-        line0 = b["lines"][0]  # get first line
-        if line0["dir"] != (1, 0):  # only accept horizontal text
-            vert_bboxes.append(bbox)
-            continue
-
-        srect = fitz.EMPTY_IRECT()
-        for line in b["lines"]:
-            lbbox = fitz.IRect(line["bbox"])
-            text = "".join([s["text"].strip() for s in line["spans"]])
-            if len(text) > 1:
-                srect |= lbbox
-        bbox = +srect
-
-        if not bbox.is_empty:
-            bboxes.append(bbox)
-
-    # Sort text bboxes by ascending background, top, then left coordinates
-    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
-
-    # Extend bboxes to the right where possible
-    bboxes = extend_right(
-        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
-    )
-
-    # immediately return of no text found
-    if bboxes == []:
-        return []
-
-    # --------------------------------------------------------------------
-    # Join bboxes to establish some column structure
-    # --------------------------------------------------------------------
-    # the final block bboxes on page
-    nblocks = [bboxes[0]]  # pre-fill with first bbox
-    bboxes = bboxes[1:]  # remaining old bboxes
-
-    for i, bb in enumerate(bboxes):  # iterate old bboxes
-        check = False  # indicates unwanted joins
-
-        # check if bb can extend one of the new blocks
-        for j in range(len(nblocks)):
-            nbb = nblocks[j]  # a new block
-
-            # never join across columns
-            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
-                continue
-
-            # never join across different background colors
-            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
-                continue
-
-            temp = bb | nbb  # temporary extension of new block
-            check = can_extend(temp, nbb, nblocks)
-            if check == True:
-                break
-
-        if not check:  # bb cannot be used to extend any of the new bboxes
-            nblocks.append(bb)  # so add it to the list
-            j = len(nblocks) - 1  # index of it
-            temp = nblocks[j]  # new bbox added
-
-        # check if some remaining bbox is contained in temp
-        check = can_extend(temp, bb, bboxes)
-        if check == False:
-            nblocks.append(bb)
-        else:
-            nblocks[j] = temp
-        bboxes[i] = None
-
-    # do some elementary cleaning
-    nblocks = clean_nblocks(nblocks)
-
-    # return identified text bboxes
-    return nblocks
-
-
-if __name__ == "__main__":
-    """Only for debugging purposes, currently.
-
-    Draw red borders around the returned text bboxes and insert
-    the bbox number.
-    Then save the file under the name "input-blocks.pdf".
-    """
-
-    # get the file name
-    filename = sys.argv[1]
-
-    # check if footer margin is given
-    if len(sys.argv) > 2:
-        footer_margin = int(sys.argv[2])
-    else:  # use default vaue
-        footer_margin = 50
-
-    # check if header margin is given
-    if len(sys.argv) > 3:
-        header_margin = int(sys.argv[3])
-    else:  # use default vaue
-        header_margin = 50
-
-    # open document
-    doc = fitz.open(filename)
-
-    # iterate over the pages
-    for page in doc:
-        # remove any geometry issues
-        page.wrap_contents()
-
-        # get the text bboxes
-        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
-
-        # prepare a canvas to draw rectangles and text
-        shape = page.new_shape()
-
-        # iterate over the bboxes
-        for i, rect in enumerate(bboxes):
-            shape.draw_rect(rect)  # draw a border
-
-            # write sequence number
-            shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
-
-        # finish drawing / text with color red
-        shape.finish(color=fitz.pdfcolor["red"])
-        shape.commit()  # store to the page
-
-    # save document with text bboxes
-    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
\ No newline at end of file
--- a/magic_pdf/libs/Constants.py
+++ b/magic_pdf/libs/Constants.py
-"""
-span维度自定义字段
-"""
-# span是否是跨页合并的
-CROSS_PAGE = "cross_page"
-
-"""
-block维度自定义字段
-"""
-# block中lines是否被删除
-LINES_DELETED = "lines_deleted"
-
-# struct eqtable
-STRUCT_EQTABLE = "struct_eqtable"
-
-# table recognition max time default value
-TABLE_MAX_TIME_VALUE = 400
-
-# pp_table_result_max_length
-TABLE_MAX_LEN = 480
-
-# pp table structure algorithm
-TABLE_MASTER = "TableMaster"
-
-# table master structure dict
-TABLE_MASTER_DICT = "table_master_structure_dict.txt"
-
-# table master dir
-TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
-
-# pp detect model dir
-DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
-
-# pp rec model dir
-REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
-
-# pp rec char dict path
-REC_CHAR_DICT = "ppocr_keys_v1.txt"
-
-
--- a/magic_pdf/libs/MakeContentConfig.py
+++ b/magic_pdf/libs/MakeContentConfig.py
-class MakeMode:
-    MM_MD = "mm_markdown"
-    NLP_MD = "nlp_markdown"
-    STANDARD_FORMAT = "standard_format"
-
-
-class DropMode:
-    WHOLE_PDF = "whole_pdf"
-    SINGLE_PAGE = "single_page"
-    NONE = "none"
--- a/magic_pdf/libs/ModelBlockTypeEnum.py
+++ b/magic_pdf/libs/ModelBlockTypeEnum.py
-from enum import Enum
-
-class ModelBlockTypeEnum(Enum):
-    TITLE = 0
-    PLAIN_TEXT = 1
-    ABANDON = 2
-    ISOLATE_FORMULA = 8
-    EMBEDDING = 13
-    ISOLATED = 14
\ No newline at end of file
--- a/magic_pdf/libs/__init__.py
+++ b/magic_pdf/libs/__init__.py
--- a/magic_pdf/libs/boxbase.py
+++ b/magic_pdf/libs/boxbase.py
-import math
-
-
-def _is_in_or_part_overlap(box1, box2) -> bool:
-    """两个bbox是否有部分重叠或者包含."""
-    if box1 is None or box2 is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = box1
-    x0_2, y0_2, x1_2, y1_2 = box2
-
-    return not (x1_1 < x0_2 or  # box1在box2的左边
-                x0_1 > x1_2 or  # box1在box2的右边
-                y1_1 < y0_2 or  # box1在box2的上边
-                y0_1 > y1_2)  # box1在box2的下边
-
-
-def _is_in_or_part_overlap_with_area_ratio(box1,
-                                           box2,
-                                           area_ratio_threshold=0.6):
-    """判断box1是否在box2里面，或者box1和box2有部分重叠，且重叠面积占box1的比例超过area_ratio_threshold."""
-    if box1 is None or box2 is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = box1
-    x0_2, y0_2, x1_2, y1_2 = box2
-
-    if not _is_in_or_part_overlap(box1, box2):
-        return False
-
-    # 计算重叠面积
-    x_left = max(x0_1, x0_2)
-    y_top = max(y0_1, y0_2)
-    x_right = min(x1_1, x1_2)
-    y_bottom = min(y1_1, y1_2)
-    overlap_area = (x_right - x_left) * (y_bottom - y_top)
-
-    # 计算box1的面积
-    box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
-
-    return overlap_area / box1_area > area_ratio_threshold
-
-
-def _is_in(box1, box2) -> bool:
-    """box1是否完全在box2里面."""
-    x0_1, y0_1, x1_1, y1_1 = box1
-    x0_2, y0_2, x1_2, y1_2 = box2
-
-    return (x0_1 >= x0_2 and  # box1的左边界不在box2的左边外
-            y0_1 >= y0_2 and  # box1的上边界不在box2的上边外
-            x1_1 <= x1_2 and  # box1的右边界不在box2的右边外
-            y1_1 <= y1_2)  # box1的下边界不在box2的下边外
-
-
-def _is_part_overlap(box1, box2) -> bool:
-    """两个bbox是否有部分重叠，但不完全包含."""
-    if box1 is None or box2 is None:
-        return False
-
-    return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2)
-
-
-def _left_intersect(left_box, right_box):
-    """检查两个box的左边界是否有交集，也就是left_box的右边界是否在right_box的左边界内."""
-    if left_box is None or right_box is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = left_box
-    x0_2, y0_2, x1_2, y1_2 = right_box
-
-    return x1_1 > x0_2 and x0_1 < x0_2 and (y0_1 <= y0_2 <= y1_1
-                                            or y0_1 <= y1_2 <= y1_1)
-
-
-def _right_intersect(left_box, right_box):
-    """检查box是否在右侧边界有交集，也就是left_box的左边界是否在right_box的右边界内."""
-    if left_box is None or right_box is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = left_box
-    x0_2, y0_2, x1_2, y1_2 = right_box
-
-    return x0_1 < x1_2 and x1_1 > x1_2 and (y0_1 <= y0_2 <= y1_1
-                                            or y0_1 <= y1_2 <= y1_1)
-
-
-def _is_vertical_full_overlap(box1, box2, x_torlence=2):
-    """x方向上：要么box1包含box2, 要么box2包含box1。不能部分包含 y方向上：box1和box2有重叠."""
-    # 解析box的坐标
-    x11, y11, x12, y12 = box1  # 左上角和右下角的坐标 (x1, y1, x2, y2)
-    x21, y21, x22, y22 = box2
-
-    # 在x轴方向上，box1是否包含box2 或 box2包含box1
-    contains_in_x = (x11 - x_torlence <= x21 and x12 + x_torlence >= x22) or (
-        x21 - x_torlence <= x11 and x22 + x_torlence >= x12)
-
-    # 在y轴方向上，box1和box2是否有重叠
-    overlap_in_y = not (y12 < y21 or y11 > y22)
-
-    return contains_in_x and overlap_in_y
-
-
-def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
-    """检查box1下方和box2的上方有轻微的重叠，轻微程度收到y_tolerance的限制 这个函数和_is_vertical-
-    full_overlap的区别是，这个函数允许box1和box2在x方向上有轻微的重叠,允许一定的模糊度."""
-    if box1 is None or box2 is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = box1
-    x0_2, y0_2, x1_2, y1_2 = box2
-    tolerance_margin = 2
-    is_xdir_full_overlap = (
-        (x0_1 - tolerance_margin <= x0_2 <= x1_1 + tolerance_margin
-         and x0_1 - tolerance_margin <= x1_2 <= x1_1 + tolerance_margin)
-        or (x0_2 - tolerance_margin <= x0_1 <= x1_2 + tolerance_margin
-            and x0_2 - tolerance_margin <= x1_1 <= x1_2 + tolerance_margin))
-
-    return y0_2 < y1_1 and 0 < (y1_1 -
-                                y0_2) < y_tolerance and is_xdir_full_overlap
-
-
-def _is_left_overlap(
-    box1,
-    box2,
-):
-    """检查box1的左侧是否和box2有重叠 在Y方向上可以是部分重叠或者是完全重叠。不分box1和box2的上下关系，也就是无论box1在box2下
-    方还是box2在box1下方，都可以检测到重叠。 X方向上."""
-
-    def __overlap_y(Ay1, Ay2, By1, By2):
-        return max(0, min(Ay2, By2) - max(Ay1, By1))
-
-    if box1 is None or box2 is None:
-        return False
-
-    x0_1, y0_1, x1_1, y1_1 = box1
-    x0_2, y0_2, x1_2, y1_2 = box2
-
-    y_overlap_len = __overlap_y(y0_1, y1_1, y0_2, y1_2)
-    ratio_1 = 1.0 * y_overlap_len / (y1_1 - y0_1) if y1_1 - y0_1 != 0 else 0
-    ratio_2 = 1.0 * y_overlap_len / (y1_2 - y0_2) if y1_2 - y0_2 != 0 else 0
-    vertical_overlap_cond = ratio_1 >= 0.5 or ratio_2 >= 0.5
-
-    # vertical_overlap_cond = y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1 or y0_2<=y0_1<=y1_2 or y0_2<=y1_1<=y1_2
-    return x0_1 <= x0_2 <= x1_1 and vertical_overlap_cond
-
-
-def __is_overlaps_y_exceeds_threshold(bbox1,
-                                      bbox2,
-                                      overlap_ratio_threshold=0.8):
-    """检查两个bbox在y轴上是否有重叠，并且该重叠区域的高度占两个bbox高度更低的那个超过80%"""
-    _, y0_1, _, y1_1 = bbox1
-    _, y0_2, _, y1_2 = bbox2
-
-    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
-    height1, height2 = y1_1 - y0_1, y1_2 - y0_2
-    # max_height = max(height1, height2)
-    min_height = min(height1, height2)
-
-    return (overlap / min_height) > overlap_ratio_threshold
-
-
-def calculate_iou(bbox1, bbox2):
-    """计算两个边界框的交并比(IOU)。
-
-    Args:
-        bbox1 (list[float]): 第一个边界框的坐标，格式为 [x1, y1, x2, y2]，其中 (x1, y1) 为左上角坐标，(x2, y2) 为右下角坐标。
-        bbox2 (list[float]): 第二个边界框的坐标，格式与 `bbox1` 相同。
-
-    Returns:
-        float: 两个边界框的交并比(IOU)，取值范围为 [0, 1]。
-    """
-    # Determine the coordinates of the intersection rectangle
-    x_left = max(bbox1[0], bbox2[0])
-    y_top = max(bbox1[1], bbox2[1])
-    x_right = min(bbox1[2], bbox2[2])
-    y_bottom = min(bbox1[3], bbox2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    # The area of overlap area
-    intersection_area = (x_right - x_left) * (y_bottom - y_top)
-
-    # The area of both rectangles
-    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
-    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
-
-    # Compute the intersection over union by taking the intersection area
-    # and dividing it by the sum of both areas minus the intersection area
-    iou = intersection_area / float(bbox1_area + bbox2_area -
-                                    intersection_area)
-    return iou
-
-
-def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
-    """计算box1和box2的重叠面积占最小面积的box的比例."""
-    # Determine the coordinates of the intersection rectangle
-    x_left = max(bbox1[0], bbox2[0])
-    y_top = max(bbox1[1], bbox2[1])
-    x_right = min(bbox1[2], bbox2[2])
-    y_bottom = min(bbox1[3], bbox2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    # The area of overlap area
-    intersection_area = (x_right - x_left) * (y_bottom - y_top)
-    min_box_area = min([(bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]),
-                        (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])])
-    if min_box_area == 0:
-        return 0
-    else:
-        return intersection_area / min_box_area
-
-
-def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
-    """计算box1和box2的重叠面积占bbox1的比例."""
-    # Determine the coordinates of the intersection rectangle
-    x_left = max(bbox1[0], bbox2[0])
-    y_top = max(bbox1[1], bbox2[1])
-    x_right = min(bbox1[2], bbox2[2])
-    y_bottom = min(bbox1[3], bbox2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    # The area of overlap area
-    intersection_area = (x_right - x_left) * (y_bottom - y_top)
-    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
-    if bbox1_area == 0:
-        return 0
-    else:
-        return intersection_area / bbox1_area
-
-
-def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
-    """通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例
-    如果比例大于ratio，则返回小的那个bbox, 否则返回None."""
-    x1_min, y1_min, x1_max, y1_max = bbox1
-    x2_min, y2_min, x2_max, y2_max = bbox2
-    area1 = (x1_max - x1_min) * (y1_max - y1_min)
-    area2 = (x2_max - x2_min) * (y2_max - y2_min)
-    overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
-    if overlap_ratio > ratio:
-        if area1 <= area2:
-            return bbox1
-        else:
-            return bbox2
-    else:
-        return None
-
-
-def get_bbox_in_boundary(bboxes: list, boundary: tuple) -> list:
-    x0, y0, x1, y1 = boundary
-    new_boxes = [
-        box for box in bboxes
-        if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1
-    ]
-    return new_boxes
-
-
-def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
-    """判断一个bbox是否在pdf页面的边缘."""
-    x0, x1 = bbox[0], bbox[2]
-    if x1 <= width * side_threshold or x0 >= width * (1 - side_threshold):
-        return True
-    return False
-
-
-def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
-    tolerance_margin = 4
-    top_boxes = [
-        box for box in pymu_blocks
-        if obj_bbox[1] - box['bbox'][3] >= -tolerance_margin
-        and not _is_in(box['bbox'], obj_bbox)
-    ]
-    # 然后找到X方向上有互相重叠的
-    top_boxes = [
-        box for box in top_boxes if any([
-            obj_bbox[0] - tolerance_margin <= box['bbox'][0] <= obj_bbox[2] +
-            tolerance_margin, obj_bbox[0] -
-            tolerance_margin <= box['bbox'][2] <= obj_bbox[2] +
-            tolerance_margin, box['bbox'][0] -
-            tolerance_margin <= obj_bbox[0] <= box['bbox'][2] +
-            tolerance_margin, box['bbox'][0] -
-            tolerance_margin <= obj_bbox[2] <= box['bbox'][2] +
-            tolerance_margin
-        ])
-    ]
-
-    # 然后找到y1最大的那个
-    if len(top_boxes) > 0:
-        top_boxes.sort(key=lambda x: x['bbox'][3], reverse=True)
-        return top_boxes[0]
-    else:
-        return None
-
-
-def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
-    bottom_boxes = [
-        box for box in pymu_blocks if box['bbox'][1] -
-        obj_bbox[3] >= -2 and not _is_in(box['bbox'], obj_bbox)
-    ]
-    # 然后找到X方向上有互相重叠的
-    bottom_boxes = [
-        box for box in bottom_boxes if any([
-            obj_bbox[0] - 2 <= box['bbox'][0] <= obj_bbox[2] + 2, obj_bbox[0] -
-            2 <= box['bbox'][2] <= obj_bbox[2] + 2, box['bbox'][0] -
-            2 <= obj_bbox[0] <= box['bbox'][2] + 2, box['bbox'][0] -
-            2 <= obj_bbox[2] <= box['bbox'][2] + 2
-        ])
-    ]
-
-    # 然后找到y0最小的那个
-    if len(bottom_boxes) > 0:
-        bottom_boxes.sort(key=lambda x: x['bbox'][1], reverse=False)
-        return bottom_boxes[0]
-    else:
-        return None
-
-
-def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
-    """寻找左侧最近的文本block."""
-    left_boxes = [
-        box for box in pymu_blocks if obj_bbox[0] -
-        box['bbox'][2] >= -2 and not _is_in(box['bbox'], obj_bbox)
-    ]
-    # 然后找到X方向上有互相重叠的
-    left_boxes = [
-        box for box in left_boxes if any([
-            obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2, obj_bbox[1] -
-            2 <= box['bbox'][3] <= obj_bbox[3] + 2, box['bbox'][1] -
-            2 <= obj_bbox[1] <= box['bbox'][3] + 2, box['bbox'][1] -
-            2 <= obj_bbox[3] <= box['bbox'][3] + 2
-        ])
-    ]
-
-    # 然后找到x1最大的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x['bbox'][2], reverse=True)
-        return left_boxes[0]
-    else:
-        return None
-
-
-def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
-    """寻找右侧最近的文本block."""
-    right_boxes = [
-        box for box in pymu_blocks if box['bbox'][0] -
-        obj_bbox[2] >= -2 and not _is_in(box['bbox'], obj_bbox)
-    ]
-    # 然后找到X方向上有互相重叠的
-    right_boxes = [
-        box for box in right_boxes if any([
-            obj_bbox[1] - 2 <= box['bbox'][1] <= obj_bbox[3] + 2, obj_bbox[1] -
-            2 <= box['bbox'][3] <= obj_bbox[3] + 2, box['bbox'][1] -
-            2 <= obj_bbox[1] <= box['bbox'][3] + 2, box['bbox'][1] -
-            2 <= obj_bbox[3] <= box['bbox'][3] + 2
-        ])
-    ]
-
-    # 然后找到x0最小的那个
-    if len(right_boxes) > 0:
-        right_boxes.sort(key=lambda x: x['bbox'][0], reverse=False)
-        return right_boxes[0]
-    else:
-        return None
-
-
-def bbox_relative_pos(bbox1, bbox2):
-    """判断两个矩形框的相对位置关系.
-
-    Args:
-        bbox1: 一个四元组，表示第一个矩形框的左上角和右下角的坐标，格式为(x1, y1, x1b, y1b)
-        bbox2: 一个四元组，表示第二个矩形框的左上角和右下角的坐标，格式为(x2, y2, x2b, y2b)
-
-    Returns:
-        一个四元组，表示矩形框1相对于矩形框2的位置关系，格式为(left, right, bottom, top)
-        其中，left表示矩形框1是否在矩形框2的左侧，right表示矩形框1是否在矩形框2的右侧，
-        bottom表示矩形框1是否在矩形框2的下方，top表示矩形框1是否在矩形框2的上方
-    """
-    x1, y1, x1b, y1b = bbox1
-    x2, y2, x2b, y2b = bbox2
-
-    left = x2b < x1
-    right = x1b < x2
-    bottom = y2b < y1
-    top = y1b < y2
-    return left, right, bottom, top
-
-
-def bbox_distance(bbox1, bbox2):
-    """计算两个矩形框的距离。
-
-    Args:
-        bbox1 (tuple): 第一个矩形框的坐标，格式为 (x1, y1, x2, y2)，其中 (x1, y1) 为左上角坐标，(x2, y2) 为右下角坐标。
-        bbox2 (tuple): 第二个矩形框的坐标，格式为 (x1, y1, x2, y2)，其中 (x1, y1) 为左上角坐标，(x2, y2) 为右下角坐标。
-
-    Returns:
-        float: 矩形框之间的距离。
-    """
-
-    def dist(point1, point2):
-        return math.sqrt((point1[0] - point2[0])**2 +
-                         (point1[1] - point2[1])**2)
-
-    x1, y1, x1b, y1b = bbox1
-    x2, y2, x2b, y2b = bbox2
-
-    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
-
-    if top and left:
-        return dist((x1, y1b), (x2b, y2))
-    elif left and bottom:
-        return dist((x1, y1), (x2b, y2b))
-    elif bottom and right:
-        return dist((x1b, y1), (x2, y2b))
-    elif right and top:
-        return dist((x1b, y1b), (x2, y2))
-    elif left:
-        return x1 - x2b
-    elif right:
-        return x2 - x1b
-    elif bottom:
-        return y1 - y2b
-    elif top:
-        return y2 - y1b
-    return 0.0
-
-
-def box_area(bbox):
-    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
-
-
-def get_overlap_area(bbox1, bbox2):
-    """计算box1和box2的重叠面积占bbox1的比例."""
-    # Determine the coordinates of the intersection rectangle
-    x_left = max(bbox1[0], bbox2[0])
-    y_top = max(bbox1[1], bbox2[1])
-    x_right = min(bbox1[2], bbox2[2])
-    y_bottom = min(bbox1[3], bbox2[3])
-
-    if x_right < x_left or y_bottom < y_top:
-        return 0.0
-
-    # The area of overlap area
-    return (x_right - x_left) * (y_bottom - y_top)
--- a/magic_pdf/libs/calc_span_stats.py
+++ b/magic_pdf/libs/calc_span_stats.py
-import os
-import csv
-import json
-import pandas as pd
-from pandas import DataFrame as df
-from matplotlib import pyplot as plt
-from termcolor import cprint
-
-"""
-Execute this script in the following way:
-
-1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
-
-    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
-    
-2. Under the directory code-clean, execute the following command:
-
-    $ python -m libs.calc_span_stats
-    
-"""
-
-
-def print_green_on_red(text):
-    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
-
-
-def print_green(text):
-    print()
-    cprint(text, "green", attrs=["bold"], end="\n\n")
-
-
-def print_red(text):
-    print()
-    cprint(text, "red", attrs=["bold"], end="\n\n")
-
-
-def safe_get(dict_obj, key, default):
-    val = dict_obj.get(key)
-    if val is None:
-        return default
-    else:
-        return val
-
-
-class SpanStatsCalc:
-    """Calculate statistics of span."""
-
-    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
-        """Draw multiple figures in one figure."""
-        # make a canvas
-        fig = plt.figure(fig_num, figsize=(20, 20))
-
-        pass
-
-    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
-        """Calculate statistics per pdf_dict."""
-        span_stats = pd.DataFrame()
-
-        span_stats = []
-        span_id = 0
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    for para_block in blocks["para_blocks"]:
-                        for line in para_block["lines"]:
-                            for span in line["spans"]:
-                                span_text = safe_get(span, "text", "")
-                                span_font_name = safe_get(span, "font", "")
-                                span_font_size = safe_get(span, "size", 0)
-                                span_font_color = safe_get(span, "color", "")
-                                span_font_flags = safe_get(span, "flags", 0)
-
-                                span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
-                                span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
-                                span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
-                                span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
-                                span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
-                                span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
-                                span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
-                                span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)
-
-                                span_stats.append(
-                                    {
-                                        "span_id": span_id,  # id of span
-                                        "page_id": page_id,  # page number of pdf
-                                        "span_text": span_text,  # text of span
-                                        "span_font_name": span_font_name,  # font name of span
-                                        "span_font_size": span_font_size,  # font size of span
-                                        "span_font_color": span_font_color,  # font color of span
-                                        "span_font_flags": span_font_flags,  # font flags of span
-                                        "span_is_superscript": int(
-                                            span_is_super_script
-                                        ),  # indicate whether the span is super script or not
-                                        "span_is_italic": int(span_is_italic),  # indicate whether the span is italic or not
-                                        "span_is_serifed": int(span_is_serifed),  # indicate whether the span is serifed or not
-                                        "span_is_sans_serifed": int(
-                                            span_is_sans_serifed
-                                        ),  # indicate whether the span is sans serifed or not
-                                        "span_is_monospaced": int(
-                                            span_is_monospaced
-                                        ),  # indicate whether the span is monospaced or not
-                                        "span_is_proportional": int(
-                                            span_is_proportional
-                                        ),  # indicate whether the span is proportional or not
-                                        "span_is_bold": int(span_is_bold),  # indicate whether the span is bold or not
-                                    }
-                                )
-
-                                span_id += 1
-
-        span_stats = pd.DataFrame(span_stats)
-        # print(span_stats)
-
-        return span_stats
-
-
-def __find_pdf_dic_files(
-    jf_name="pdf_dic.json",
-    base_code_name="code-clean",
-    tgt_base_dir_name="tmp",
-    unittest_dir_name="unittest",
-    md_dir_name="md",
-    book_names=[
-        "scihub",
-    ],  # other possible values: "zlib", "arxiv" and so on
-):
-    pdf_dict_files = []
-
-    curr_dir = os.path.dirname(__file__)
-
-    for i in range(len(curr_dir)):
-        if curr_dir[i : i + len(base_code_name)] == base_code_name:
-            base_code_dir_name = curr_dir[: i + len(base_code_name)]
-            for book_name in book_names:
-                search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
-                if os.path.exists(base_code_dir_name):
-                    search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
-                    for root, dirs, files in os.walk(search_dir_name):
-                        for file in files:
-                            if file == jf_name:
-                                pdf_dict_files.append(os.path.join(root, file))
-                break
-
-    return pdf_dict_files
-
-
-def combine_span_texts(group_df, span_stats):
-    combined_span_texts = []
-    for _, row in group_df.iterrows():
-        curr_span_id = row.name
-        curr_span_text = row["span_text"]
-
-        pre_span_id = curr_span_id - 1
-        pre_span_text = span_stats.at[pre_span_id, "span_text"] if pre_span_id in span_stats.index else ""
-
-        next_span_id = curr_span_id + 1
-        next_span_text = span_stats.at[next_span_id, "span_text"] if next_span_id in span_stats.index else ""
-
-        # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
-        pointer_sign = "→ → → "
-        combined_text = "\n".join([pointer_sign + pre_span_text, pointer_sign + curr_span_text, pointer_sign + next_span_text])
-        combined_span_texts.append(combined_text)
-
-    return "\n\n".join(combined_span_texts)
-
-
-# pd.set_option("display.max_colwidth", None)  # 设置为 None 来显示完整的文本
-pd.set_option("display.max_rows", None)  # 设置为 None 来显示更多的行
-
-
-def main():
-    pdf_dict_files = __find_pdf_dic_files()
-    # print(pdf_dict_files)
-
-    span_stats_calc = SpanStatsCalc()
-
-    for pdf_dict_file in pdf_dict_files:
-        print("-" * 100)
-        print_green_on_red(f"Processing {pdf_dict_file}")
-
-        with open(pdf_dict_file, "r", encoding="utf-8") as f:
-            pdf_dict = json.load(f)
-
-            raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
-            raw_df.to_csv(save_path, index=False)
-
-            filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
-            if filtered_df.empty:
-                print("No superscript span found!")
-                continue
-
-            filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
-
-            combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
-
-            final_df = filtered_grouped_df.size().reset_index(name="count")
-            final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
-
-            print(final_df)
-
-            final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
-
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
-            # 使用 UTF-8 编码并添加 BOM，确保所有字段被双引号包围
-            final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
-
-            # 创建一个 2x2 的图表布局
-            fig, axs = plt.subplots(2, 2, figsize=(15, 10))
-
-            # 按照 span_font_name 分类作图
-            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
-
-            # 按照 span_font_size 分类作图
-            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
-
-            # 按照 span_font_color 分类作图
-            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
-
-            # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图
-            grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
-            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
-
-            # 调整布局
-            plt.tight_layout()
-
-            # 显示图表
-            # plt.show()
-
-            # 保存图表到 PNG 文件
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
-            plt.savefig(save_path)
-
-            # 清除画布
-            plt.clf()
-
-
-if __name__ == "__main__":
-    main()
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-
-import fitz # 1.23.9中已经切换到rebase
-# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
-
-
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
-
-
-def join_path(*args):
-    return '/'.join(str(s).rstrip('/') for s in args)
-
-
-#配置全局的errlog_path，方便demo同步引用
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径，应该在业务代码中定义
-
-
-def get_top_percent_list(num_list, percent):
-    """
-    获取列表中前百分之多少的元素
-    :param num_list:
-    :param percent:
-    :return:
-    """
-    if len(num_list) == 0:
-        top_percent_list = []
-    else:
-        # 对imgs_len_list排序
-        sorted_imgs_len_list = sorted(num_list, reverse=True)
-        # 计算 percent 的索引
-        top_percent_index = int(len(sorted_imgs_len_list) * percent)
-        # 取前80%的元素
-        top_percent_list = sorted_imgs_len_list[:top_percent_index]
-    return top_percent_list
-
-
-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
-
-
-def mymax(alist: list):
-    if len(alist) == 0:
-        return 0  # 空是0， 0*0也是0大小q
-    else:
-        return max(alist)
-
-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # 解析配置文件
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # 获取 AWS 账户相关信息
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-
-    return ak, sk, endpoint, addressing_style
-
-
-def parse_bucket_key(s3_full_path: str):
-    """
-    输入 s3://bucket/path/to/my/file.txt
-    输出 bucket, path/to/my/file.txt
-    """
-    s3_full_path = s3_full_path.strip()
-    if s3_full_path.startswith("s3://"):
-        s3_full_path = s3_full_path[5:]
-    if s3_full_path.startswith("/"):
-        s3_full_path = s3_full_path[1:]
-    bucket, key = s3_full_path.split("/", 1)
-    return bucket, key
-
-
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-
-
-def get_docx_model_output(pdf_model_output, page_id):
-
-    model_output_json = pdf_model_output[page_id]
-
-    return model_output_json
-
-
-def list_dir(dir_path:str, s3_profile:str):
-    """
-    列出dir_path下的所有文件
-    """
-    ret = []
-    
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                                            config=Config(s3={'addressing_style': addressing_style}))
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents)==0:
-                        break
-                    marker = contents[-1]['Key']
-
-
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                #size = info['Size']
-
-                if path!="":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-                        
-            return ret
-
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
-        
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-
-def get_img_s3_client(save_path:str, image_s3_config:str):
-    """
-    """
-    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-        
-    return img_s3_client
-
-if __name__=="__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)
-    
\ No newline at end of file
--- a/magic_pdf/libs/config_reader.py
+++ b/magic_pdf/libs/config_reader.py
-"""
-根据bucket的名字返回对应的s3 AK， SK，endpoint三元组
-
-"""
-
-import json
-import os
-
-from loguru import logger
-
-from magic_pdf.libs.commons import parse_bucket_key
-
-# 定义配置文件名常量
-CONFIG_FILE_NAME = "magic-pdf.json"
-
-
-def read_config():
-    home_dir = os.path.expanduser("~")
-
-    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
-
-    if not os.path.exists(config_file):
-        raise FileNotFoundError(f"{config_file} not found")
-
-    with open(config_file, "r", encoding="utf-8") as f:
-        config = json.load(f)
-    return config
-
-
-def get_s3_config(bucket_name: str):
-    """
-    ~/magic-pdf.json 读出来
-    """
-    config = read_config()
-
-    bucket_info = config.get("bucket_info")
-    if bucket_name not in bucket_info:
-        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
-    else:
-        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
-
-    if access_key is None or secret_key is None or storage_endpoint is None:
-        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
-
-    # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
-
-    return access_key, secret_key, storage_endpoint
-
-
-def get_s3_config_dict(path: str):
-    access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
-    return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
-
-
-def get_bucket_name(path):
-    bucket, key = parse_bucket_key(path)
-    return bucket
-
-
-def get_local_models_dir():
-    config = read_config()
-    models_dir = config.get("models-dir")
-    if models_dir is None:
-        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
-        return "/tmp/models"
-    else:
-        return models_dir
-
-
-def get_device():
-    config = read_config()
-    device = config.get("device-mode")
-    if device is None:
-        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
-        return "cpu"
-    else:
-        return device
-
-
-def get_table_recog_config():
-    config = read_config()
-    table_config = config.get("table-config")
-    if table_config is None:
-        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
-        return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
-    else:
-        return table_config
-
-
-if __name__ == "__main__":
-    ak, sk, endpoint = get_s3_config("llm-raw")
--- a/magic_pdf/libs/convert_utils.py
+++ b/magic_pdf/libs/convert_utils.py
-def dict_to_list(input_dict):
-    items_list = []
-    for _, item in input_dict.items():
-        items_list.append(item)
-    return items_list
--- a/magic_pdf/libs/coordinate_transform.py
+++ b/magic_pdf/libs/coordinate_transform.py
-def get_scale_ratio(model_page_info, page):
-    pix = page.get_pixmap(dpi=72)
-    pymu_width = int(pix.w)
-    pymu_height = int(pix.h)
-    width_from_json = model_page_info['page_info']['width']
-    height_from_json = model_page_info['page_info']['height']
-    horizontal_scale_ratio = width_from_json / pymu_width
-    vertical_scale_ratio = height_from_json / pymu_height
-    return horizontal_scale_ratio, vertical_scale_ratio
--- a/magic_pdf/libs/detect_language_from_model.py
+++ b/magic_pdf/libs/detect_language_from_model.py
-from collections import Counter
-
-from magic_pdf.libs.language import detect_lang
-
-def get_language_from_model(model_list: list):
-    language_lst = []
-    for ocr_page_info in model_list:
-        page_text = ""
-        layout_dets = ocr_page_info["layout_dets"]
-        for layout_det in layout_dets:
-            category_id = layout_det["category_id"]
-            allow_category_id_list = [15]
-            if category_id in allow_category_id_list:
-                page_text += layout_det["text"]
-        page_language = detect_lang(page_text)
-        language_lst.append(page_language)
-    # 统计text_language_list中每种语言的个数
-    count_dict = Counter(language_lst)
-    # 输出text_language_list中出现的次数最多的语言
-    language = max(count_dict, key=count_dict.get)
-    return language