Merge pull request #1120 from opendatalab/release-0.10.2

Release 0.10.2

Merge pull request #1120 from opendatalab/release-0.10.2
Release 0.10.2
8afff9ae · Xiaomeng Zhao · GitHub · 4df1eb74 · 7fdbb6e5 · 8afff9ae
Unverified commit 8afff9ae, authored Nov 27, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 27, 2024
20 changed files
--- a/README.md
+++ b/README.md
@@ -277,88 +277,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
 ### Command Line
-```bash
+[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/command_line.html)
-magic-pdf --help
-Usage: magic-pdf [OPTIONS]
-Options:
-  -v, --version                display the version and exit
-  -p, --path PATH              local pdf filepath or directory  [required]
-  -o, --output-dir PATH        output local directory  [required]
-  -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
-                               technique to extract information from pdf. txt:
-                               suitable for the text-based pdf only and
-                               outperform ocr. auto: automatically choose the
-                               best method for parsing pdf from ocr and txt.
-                               without method specified, auto will be used by
-                               default.
-  -l, --lang TEXT              Input the languages in the pdf (if known) to
-                               improve OCR accuracy.  Optional. You should
-                               input "Abbreviation" with language form url: ht
-                               tps://paddlepaddle.github.io/PaddleOCR/latest/en
-                               /ppocr/blog/multi_languages.html#5-support-languages-
-                               and-abbreviations
-  -d, --debug BOOLEAN          Enables detailed debugging information during
-                               the execution of the CLI commands.
-  -s, --start INTEGER          The starting page for PDF parsing, beginning
-                               from 0.
-  -e, --end INTEGER            The ending page for PDF parsing, beginning from
-                               0.
-  --help                       Show this message and exit.
-## show version
-magic-pdf -v
-## command line example
-magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
-```
-`{some_pdf}` can be a single PDF file or a directory containing multiple PDFs.
-The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
-```text
-├── some_pdf.md                          # markdown file
-├── images                               # directory for storing images
-├── some_pdf_layout.pdf                  # layout diagram (Include layout reading order)
-├── some_pdf_middle.json                 # MinerU intermediate processing result
-├── some_pdf_model.json                  # model inference result
-├── some_pdf_origin.pdf                  # original PDF file
-├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
-└── some_pdf_content_list.json           # Rich text JSON arranged in reading order
-```
 > [!TIP]
 > For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
 ### API
-Processing files from local disk
+[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html)
-```python
-image_writer = DiskReaderWriter(local_image_dir)
-image_dir = str(os.path.basename(local_image_dir))
-jso_useful_key = {"_pdf_type": "", "model_list": []}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
-Processing files from object storage
-```python
-s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
-image_dir = "s3://img_bucket/"
-s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
-pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
-jso_useful_key = {"_pdf_type": "", "model_list": []}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
 For detailed implementation, refer to:

--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -284,89 +284,14 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
 ### 命令行
-```bash
+[通过命令行使用MinerU](https://mineru.readthedocs.io/zh-cn/latest/user_guide/quick_start/command_line.html)
-magic-pdf --help
-Usage: magic-pdf [OPTIONS]
-Options:
-  -v, --version                display the version and exit
-  -p, --path PATH              local pdf filepath or directory  [required]
-  -o, --output-dir PATH        output local directory  [required]
-  -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
-                               technique to extract information from pdf. txt:
-                               suitable for the text-based pdf only and
-                               outperform ocr. auto: automatically choose the
-                               best method for parsing pdf from ocr and txt.
-                               without method specified, auto will be used by
-                               default.
-  -l, --lang TEXT              Input the languages in the pdf (if known) to
-                               improve OCR accuracy.  Optional. You should
-                               input "Abbreviation" with language form url: ht
-                               tps://paddlepaddle.github.io/PaddleOCR/latest/en
-                               /ppocr/blog/multi_languages.html#5-support-languages-
-                               and-abbreviations
-  -d, --debug BOOLEAN          Enables detailed debugging information during
-                               the execution of the CLI commands.
-  -s, --start INTEGER          The starting page for PDF parsing, beginning
-                               from 0.
-  -e, --end INTEGER            The ending page for PDF parsing, beginning from
-                               0.
-  --help                       Show this message and exit.
-## show version
-magic-pdf -v
-## command line example
-magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
-```
-其中 `{some_pdf}` 可以是单个pdf文件，也可以是一个包含多个pdf文件的目录。
-运行完命令后输出的结果会保存在`{some_output_dir}`目录下, 输出的文件列表如下
-```text
-├── some_pdf.md                          # markdown 文件
-├── images                               # 存放图片目录
-├── some_pdf_layout.pdf                  # layout 绘图 （包含layout阅读顺序）
-├── some_pdf_middle.json                 # minerU 中间处理结果
-├── some_pdf_model.json                  # 模型推理结果
-├── some_pdf_origin.pdf                  # 原 pdf 文件
-├── some_pdf_spans.pdf                   # 最小粒度的bbox位置信息绘图
-└── some_pdf_content_list.json           # 按阅读顺序排列的富文本json
-```
 > [!TIP]
 > 更多有关输出文件的信息，请参考[输出文件说明](docs/output_file_zh_cn.md)
 ### API
-处理本地磁盘上的文件
+[通过Python代码调用MinerU](https://mineru.readthedocs.io/zh-cn/latest/user_guide/quick_start/to_markdown.html)
-```python
-image_writer = DiskReaderWriter(local_image_dir)
-image_dir = str(os.path.basename(local_image_dir))
-jso_useful_key = {"_pdf_type": "", "model_list": []}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
-处理对象存储上的文件
-```python
-s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
-image_dir = "s3://img_bucket/"
-s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
-pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
-jso_useful_key = {"_pdf_type": "", "model_list": []}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
-pipe.pipe_classify()
-pipe.pipe_analyze()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
 详细实现可参考

--- a/magic_pdf/dict2md/mkcontent.py
+++ b/magic_pdf/dict2md/mkcontent.py
-import math
-from loguru import logger
-from magic_pdf.config.ocr_content_type import ContentType
-from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
-                                    find_top_nearest_text_bbox)
-from magic_pdf.libs.commons import join_path
-TYPE_INLINE_EQUATION = ContentType.InlineEquation
-TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
-UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
-@DeprecationWarning
-def mk_nlp_markdown_1(para_dict: dict):
-    """对排序后的bboxes拼接内容."""
-    content_lst = []
-    for _, page_info in para_dict.items():
-        para_blocks = page_info.get('para_blocks')
-        if not para_blocks:
-            continue
-        for block in para_blocks:
-            item = block['paras']
-            for _, p in item.items():
-                para_text = p['para_text']
-                is_title = p['is_para_title']
-                title_level = p['para_title_level']
-                md_title_prefix = '#' * title_level
-                if is_title:
-                    content_lst.append(f'{md_title_prefix} {para_text}')
-                else:
-                    content_lst.append(para_text)
-    content_text = '\n\n'.join(content_lst)
-    return content_text
-# 找到目标字符串在段落中的索引
-def __find_index(paragraph, target):
-    index = paragraph.find(target)
-    if index != -1:
-        return index
-    else:
-        return None
-def __insert_string(paragraph, target, position):
-    new_paragraph = paragraph[:position] + target + paragraph[position:]
-    return new_paragraph
-def __insert_after(content, image_content, target):
-    """在content中找到target，将image_content插入到target后面."""
-    index = content.find(target)
-    if index != -1:
-        content = (
-            content[: index + len(target)]
-            + '\n\n'
-            + image_content
-            + '\n\n'
-            + content[index + len(target) :]
-        )
-    else:
-        logger.error(
-            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
-        )
-    return content
-def __insert_before(content, image_content, target):
-    """在content中找到target，将image_content插入到target前面."""
-    index = content.find(target)
-    if index != -1:
-        content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
-    else:
-        logger.error(
-            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
-        )
-    return content
-@DeprecationWarning
-def mk_mm_markdown_1(para_dict: dict):
-    """拼装多模态markdown."""
-    content_lst = []
-    for _, page_info in para_dict.items():
-        page_lst = []  # 一个page内的段落列表
-        para_blocks = page_info.get('para_blocks')
-        pymu_raw_blocks = page_info.get('preproc_blocks')
-        all_page_images = []
-        all_page_images.extend(page_info.get('images', []))
-        all_page_images.extend(page_info.get('image_backup', []))
-        all_page_images.extend(page_info.get('tables', []))
-        all_page_images.extend(page_info.get('table_backup', []))
-        if not para_blocks or not pymu_raw_blocks:  # 只有图片的拼接的场景
-            for img in all_page_images:
-                page_lst.append(f"![]({img['image_path']})")  # TODO 图片顺序
-            page_md = '\n\n'.join(page_lst)
-        else:
-            for block in para_blocks:
-                item = block['paras']
-                for _, p in item.items():
-                    para_text = p['para_text']
-                    is_title = p['is_para_title']
-                    title_level = p['para_title_level']
-                    md_title_prefix = '#' * title_level
-                    if is_title:
-                        page_lst.append(f'{md_title_prefix} {para_text}')
-                    else:
-                        page_lst.append(para_text)
-            """拼装成一个页面的文本"""
-            page_md = '\n\n'.join(page_lst)
-            """插入图片"""
-            for img in all_page_images:
-                imgbox = img['bbox']
-                img_content = f"![]({img['image_path']})"
-                # 先看在哪个block内
-                for block in pymu_raw_blocks:
-                    bbox = block['bbox']
-                    if (
-                        bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
-                        and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
-                    ):  # 确定在block内
-                        for l in block['lines']:  # noqa: E741
-                            line_box = l['bbox']
-                            if (
-                                line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
-                                and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
-                            ):  # 在line内的，插入line前面
-                                line_txt = ''.join([s['text'] for s in l['spans']])
-                                page_md = __insert_before(
-                                    page_md, img_content, line_txt
-                                )
-                                break
-                            break
-                        else:  # 在行与行之间
-                            # 找到图片x0,y0与line的x0,y0最近的line
-                            min_distance = 100000
-                            min_line = None
-                            for l in block['lines']:  # noqa: E741
-                                line_box = l['bbox']
-                                distance = math.sqrt(
-                                    (line_box[0] - imgbox[0]) ** 2
-                                    + (line_box[1] - imgbox[1]) ** 2
-                                )
-                                if distance < min_distance:
-                                    min_distance = distance
-                                    min_line = l
-                            if min_line:
-                                line_txt = ''.join(
-                                    [s['text'] for s in min_line['spans']]
-                                )
-                                img_h = imgbox[3] - imgbox[1]
-                                if min_distance < img_h:  # 文字在图片前面
-                                    page_md = __insert_after(
-                                        page_md, img_content, line_txt
-                                    )
-                                else:
-                                    page_md = __insert_before(
-                                        page_md, img_content, line_txt
-                                    )
-                            else:
-                                logger.error(
-                                    f"Can't find the location of image {img['image_path']} in the markdown file  #1"
-                                )
-                else:  # 应当在两个block之间
-                    # 找到上方最近的block，如果上方没有就找大下方最近的block
-                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
-                    if top_txt_block:
-                        line_txt = ''.join(
-                            [s['text'] for s in top_txt_block['lines'][-1]['spans']]
-                        )
-                        page_md = __insert_after(page_md, img_content, line_txt)
-                    else:
-                        bottom_txt_block = find_bottom_nearest_text_bbox(
-                            pymu_raw_blocks, imgbox
-                        )
-                        if bottom_txt_block:
-                            line_txt = ''.join(
-                                [
-                                    s['text']
-                                    for s in bottom_txt_block['lines'][0]['spans']
-                                ]
-                            )
-                            page_md = __insert_before(page_md, img_content, line_txt)
-                        else:
-                            logger.error(
-                                f"Can't find the location of image {img['image_path']} in the markdown file  #2"
-                            )
-        content_lst.append(page_md)
-    """拼装成全部页面的文本"""
-    content_text = '\n\n'.join(content_lst)
-    return content_text
-def __insert_after_para(text, type, element, content_list):
-    """在content_list中找到text，将image_path作为一个新的node插入到text后面."""
-    for i, c in enumerate(content_list):
-        content_type = c.get('type')
-        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
-            if type == 'image':
-                content_node = {
-                    'type': 'image',
-                    'img_path': element.get('image_path'),
-                    'img_alt': '',
-                    'img_title': '',
-                    'img_caption': '',
-                }
-            elif type == 'table':
-                content_node = {
-                    'type': 'table',
-                    'img_path': element.get('image_path'),
-                    'table_latex': element.get('text'),
-                    'table_title': '',
-                    'table_caption': '',
-                    'table_quality': element.get('quality'),
-                }
-            content_list.insert(i + 1, content_node)
-            break
-    else:
-        logger.error(
-            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
-        )
-def __insert_before_para(text, type, element, content_list):
-    """在content_list中找到text，将image_path作为一个新的node插入到text前面."""
-    for i, c in enumerate(content_list):
-        content_type = c.get('type')
-        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
-            if type == 'image':
-                content_node = {
-                    'type': 'image',
-                    'img_path': element.get('image_path'),
-                    'img_alt': '',
-                    'img_title': '',
-                    'img_caption': '',
-                }
-            elif type == 'table':
-                content_node = {
-                    'type': 'table',
-                    'img_path': element.get('image_path'),
-                    'table_latex': element.get('text'),
-                    'table_title': '',
-                    'table_caption': '',
-                    'table_quality': element.get('quality'),
-                }
-            content_list.insert(i, content_node)
-            break
-    else:
-        logger.error(
-            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
-        )
-def mk_universal_format(pdf_info_list: list, img_buket_path):
-    """构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
-    content_lst = []
-    for page_info in pdf_info_list:
-        page_lst = []  # 一个page内的段落列表
-        para_blocks = page_info.get('para_blocks')
-        pymu_raw_blocks = page_info.get('preproc_blocks')
-        all_page_images = []
-        all_page_images.extend(page_info.get('images', []))
-        all_page_images.extend(page_info.get('image_backup', []))
-        # all_page_images.extend(page_info.get("tables",[]))
-        # all_page_images.extend(page_info.get("table_backup",[]) )
-        all_page_tables = []
-        all_page_tables.extend(page_info.get('tables', []))
-        if not para_blocks or not pymu_raw_blocks:  # 只有图片的拼接的场景
-            for img in all_page_images:
-                content_node = {
-                    'type': 'image',
-                    'img_path': join_path(img_buket_path, img['image_path']),
-                    'img_alt': '',
-                    'img_title': '',
-                    'img_caption': '',
-                }
-                page_lst.append(content_node)  # TODO 图片顺序
-            for table in all_page_tables:
-                content_node = {
-                    'type': 'table',
-                    'img_path': join_path(img_buket_path, table['image_path']),
-                    'table_latex': table.get('text'),
-                    'table_title': '',
-                    'table_caption': '',
-                    'table_quality': table.get('quality'),
-                }
-                page_lst.append(content_node)  # TODO 图片顺序
-        else:
-            for block in para_blocks:
-                item = block['paras']
-                for _, p in item.items():
-                    font_type = p[
-                        'para_font_type'
-                    ]  # 对于文本来说，要么是普通文本，要么是个行间公式
-                    if font_type == TYPE_INTERLINE_EQUATION:
-                        content_node = {'type': 'equation', 'latex': p['para_text']}
-                        page_lst.append(content_node)
-                    else:
-                        para_text = p['para_text']
-                        is_title = p['is_para_title']
-                        title_level = p['para_title_level']
-                        if is_title:
-                            content_node = {
-                                'type': f'h{title_level}',
-                                'text': para_text,
-                            }
-                            page_lst.append(content_node)
-                        else:
-                            content_node = {'type': 'text', 'text': para_text}
-                            page_lst.append(content_node)
-        content_lst.extend(page_lst)
-        """插入图片"""
-        for img in all_page_images:
-            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
-        """插入表格"""
-        for table in all_page_tables:
-            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
-    # end for
-    return content_lst
-def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
-    element_bbox = element['bbox']
-    # 先看在哪个block内
-    for block in pymu_raw_blocks:
-        bbox = block['bbox']
-        if (
-            bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
-            and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
-        ):  # 确定在这个大的block内，然后进入逐行比较距离
-            for l in block['lines']:  # noqa: E741
-                line_box = l['bbox']
-                if (
-                    line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
-                    and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
-                ):  # 在line内的，插入line前面
-                    line_txt = ''.join([s['text'] for s in l['spans']])
-                    __insert_before_para(line_txt, type, element, content_lst)
-                    break
-                break
-            else:  # 在行与行之间
-                # 找到图片x0,y0与line的x0,y0最近的line
-                min_distance = 100000
-                min_line = None
-                for l in block['lines']:  # noqa: E741
-                    line_box = l['bbox']
-                    distance = math.sqrt(
-                        (line_box[0] - element_bbox[0]) ** 2
-                        + (line_box[1] - element_bbox[1]) ** 2
-                    )
-                    if distance < min_distance:
-                        min_distance = distance
-                        min_line = l
-                if min_line:
-                    line_txt = ''.join([s['text'] for s in min_line['spans']])
-                    img_h = element_bbox[3] - element_bbox[1]
-                    if min_distance < img_h:  # 文字在图片前面
-                        __insert_after_para(line_txt, type, element, content_lst)
-                    else:
-                        __insert_before_para(line_txt, type, element, content_lst)
-                    break
-                else:
-                    logger.error(
-                        f"Can't find the location of image {element.get('image_path')} in the markdown file  #1"
-                    )
-    else:  # 应当在两个block之间
-        # 找到上方最近的block，如果上方没有就找大下方最近的block
-        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
-        if top_txt_block:
-            line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
-            __insert_after_para(line_txt, type, element, content_lst)
-        else:
-            bottom_txt_block = find_bottom_nearest_text_bbox(
-                pymu_raw_blocks, element_bbox
-            )
-            if bottom_txt_block:
-                line_txt = ''.join(
-                    [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
-                )
-                __insert_before_para(line_txt, type, element, content_lst)
-            else:  # TODO ，图片可能独占一列，这种情况上下是没有图片的
-                logger.error(
-                    f"Can't find the location of image {element.get('image_path')} in the markdown file  #2"
-                )
-def mk_mm_markdown(content_list):
-    """基于同一格式的内容列表，构造markdown，含图片."""
-    content_md = []
-    for c in content_list:
-        content_type = c.get('type')
-        if content_type == 'text':
-            content_md.append(c.get('text'))
-        elif content_type == 'equation':
-            content = c.get('latex')
-            if content.startswith('$$') and content.endswith('$$'):
-                content_md.append(content)
-            else:
-                content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
-        elif content_type in UNI_FORMAT_TEXT_TYPE:
-            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-        elif content_type == 'image':
-            content_md.append(f"![]({c.get('img_path')})")
-    return '\n\n'.join(content_md)
-def mk_nlp_markdown(content_list):
-    """基于同一格式的内容列表，构造markdown，不含图片."""
-    content_md = []
-    for c in content_list:
-        content_type = c.get('type')
-        if content_type == 'text':
-            content_md.append(c.get('text'))
-        elif content_type == 'equation':
-            content_md.append(f"$$\n{c.get('latex')}\n$$")
-        elif content_type == 'table':
-            content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
-        elif content_type in UNI_FORMAT_TEXT_TYPE:
-            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-    return '\n\n'.join(content_md)
--- a/magic_pdf/filter/pdf_meta_scan.py
+++ b/magic_pdf/filter/pdf_meta_scan.py
 """输入： s3路径，每行一个 输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置."""
-import sys
 from collections import Counter
-import click
+import fitz
 from loguru import logger
 from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
+from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.pdf_check import detect_invalid_chars
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
        return res
-@click.command()
-@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
-@click.option('--s3-profile', help='s3上的profile')
-def main(s3_pdf_path: str, s3_profile: str):
-    """"""
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
-        logger.exception(e)
 if __name__ == '__main__':
-    main()
+    pass
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"

--- a/magic_pdf/layout/__init__.py
+++ b/magic_pdf/layout/__init__.py
--- a/magic_pdf/layout/bbox_sort.py
+++ b/magic_pdf/layout/bbox_sort.py
-# 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
-# 其中x0, y0代表左上角坐标，x1, y1代表右下角坐标，坐标原点在左上角。
-from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
-from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
-from magic_pdf.libs.commons import mymax
-X0_IDX = 0
-Y0_IDX = 1
-X1_IDX = 2
-Y1_IDX = 3
-CONTENT_IDX = 4
-IDX_X = 5
-IDX_Y = 6
-CONTENT_TYPE_IDX = 7
-X0_EXT_IDX = 8
-Y0_EXT_IDX = 9
-X1_EXT_IDX = 10
-Y1_EXT_IDX = 11
-def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
-    """
-    text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
-    把bbox重新组装成一个list，每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说，block_content是图片的地址， 对于段落来说，block_content是pymupdf里的block结构
-    """
-    all_bboxes = []
-    for image in image_info:
-        box = image['bbox']
-        # 由于没有实现横向的栏切分，因此在这里先过滤掉一些小的图片。这些图片有可能影响layout，造成没有横向栏切分的情况下，layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
-        # 把长宽都小于50的去掉
-        if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
-            continue
-        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
-    for table in table_info:
-        box = table['bbox']
-        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
-    """由于公式与段落混合，因此公式不再参与layout划分，无需加入all_bboxes"""
-    # 加入文本block
-    text_block_temp = []
-    for block in text_raw_blocks:
-        bbox = block['bbox']
-        text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
-    text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)   
-    text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox，有可能让layout探测陷入无限循环
-    """找出会影响layout的色块、横向分割线"""
-    spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
-    # 还要去掉存在于spilter_bboxes里的text_block
-    if len(spilter_bboxes) > 0:
-        text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
-    for bbox in text_block_new:
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) 
-    for bbox in spilter_bboxes:
-        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
-    return all_bboxes
-def resolve_bbox_overlap_for_layout_det(bboxes:list):
-    """
-    1. 去掉bbox互相包含的，去掉被包含的
-    2. 上下方向上如果有重叠，就扩大大box范围，直到覆盖小box
-    """
-    def _is_in_other_bbox(i:int):
-        """
-        判断i个box是否被其他box有所包含
-        """
-        for j in range(0, len(bboxes)):
-            if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
-                return True
-            # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
-            #     return True
-        return False
-    # 首先去掉被包含的bbox
-    new_bbox_1 = []
-    for i in range(0, len(bboxes)):
-        if not _is_in_other_bbox(i):
-            new_bbox_1.append(bboxes[i])
-    # 其次扩展大的box
-    new_box = []
-    new_bbox_2 = []
-    len_1 = len(new_bbox_2)
-    while True:
-        merged_idx = []
-        for i in range(0, len(new_bbox_1)):
-            if i in merged_idx:
-                continue
-            for j in range(i+1, len(new_bbox_1)):
-                if j in merged_idx:
-                    continue
-                bx1 = new_bbox_1[i]
-                bx2 = new_bbox_1[j]
-                if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
-                    merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
-                    new_bbox_2.append(merged_box)
-                    merged_idx.append(i)
-                    merged_idx.append(j)
-        for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
-            if i not in merged_idx:
-                new_bbox_2.append(new_bbox_1[i])        
-        if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
-            break
-        else:
-            len_1 = len(new_bbox_2)
-            new_box = new_bbox_2
-            new_bbox_1, new_bbox_2 = new_bbox_2, []
-    return new_box
-def filter_lines_bbox(bboxes: list):
-    """
-    过滤掉bbox为空的行
-    """
-    new_box = []
-    for box in bboxes:
-        x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
-        if abs(x0-x1)<=1 or abs(y0-y1)<=1:
-            continue
-        else:
-            new_box.append(box)
-    return new_box
-################################################################################
-# 第一种排序算法
-# 以下是基于延长线遮挡做的一个算法
-#
-################################################################################
-def find_all_left_bbox(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox左边的所有bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
-    return left_boxes
-def find_all_top_bbox(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox上面的所有bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
-    return top_boxes
-def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
-    """
-    寻找this_bbox在all_bboxes中的遮挡深度 idx_x
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
-        if len(all_left_bboxes) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
-            max_idx_x = mymax(all_left_bboxes_idx)
-            this_bbox[IDX_X] = max_idx_x + 1
-        return this_bbox[IDX_X]
-def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
-    """
-    寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
-        if len(all_top_bboxes) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
-            max_idx_y = mymax(all_top_bboxes_idx)
-            this_bbox[IDX_Y] = max_idx_y + 1
-        return this_bbox[IDX_Y]
-def bbox_sort(all_bboxes: list):
-    """
-    排序
-    """
-    all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-################################################################################
-# 第二种排序算法
-# 下面的算法在计算idx_x和idx_y的时候不考虑延长线，而只考虑实际的长或者宽被遮挡的情况
-#
-################################################################################
-def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        left_boxes = [left_boxes[0]]
-    else:
-        left_boxes = []
-    return left_boxes
-def get_and_set_idx_x_2(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
-        if len(left_nearest_bbox) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
-            this_bbox[IDX_X] = left_idx_x + 1
-        return this_bbox[IDX_X]
-def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(top_boxes) > 0:
-        top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_boxes = [top_boxes[0]]
-    else:
-        top_boxes = []
-    return top_boxes
-def get_and_set_idx_y_2(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
-        if len(top_nearest_bbox) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
-            this_bbox[IDX_Y] = top_idx_y + 1
-        return this_bbox[IDX_Y]
-def paper_bbox_sort(all_bboxes: list, page_width, page_height):
-    all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-################################################################################
-"""
-第三种排序算法, 假设page的最左侧为X0，最右侧为X1，最上侧为Y0，最下侧为Y1
-这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下：
-1. 首先在水平方向上对bbox进行扩展。扩展方法是：
-    - 对每个bbox，找到其左边最近的bbox（也就是y方向有重叠），然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox，那么就将其左边界扩展到page的最左侧X0。
-    - 对每个bbox，找到其右边最近的bbox（也就是y方向有重叠），然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox，那么就将其右边界扩展到page的最右侧X1。
-    - 经过上面2个步骤，bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
-2. 合并所有的连续水平方向的bbox, 合并方法是：
-    - 对bbox进行y方向排序，然后从上到下遍历所有bbox，如果当前bbox和下一个bbox的x0, x1等于X0, X1，那么就合并这两个bbox。
-3. 然后在垂直方向上对bbox进行扩展。扩展方法是：
-    - 首先从page上切割掉合并后的水平bbox, 得到几个新的block
-    针对每个block
-    - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有，则x0=X0
-    - x1: 找到位于右侧x=x1延长线右侧所有的bboxes， 找到最小的x0, 让x1=x0-1。如果没有，则x1=X1
-    随后在垂直方向上合并所有的连续的block，方法如下：
-    - 对block进行x方向排序，然后从左到右遍历所有block，如果当前block和下一个block的x0, x1相等，那么就合并这两个block。
-    如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’
-    如果在某个垂直方向上无法被完全分割到一个block，那么就将这个block打上标签'BAD_LAYOUT'。
-    至此完成，一个页面的预处理，天然的block要么属于'GOOD_LAYOUT'，要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面，可以先按照自上而下，自左到右进行天然排序，也可以先过滤掉这种书籍。
-    (完成条件下次加强：进行水平方向切分，把混乱的layout部分尽可能切割出去)
-"""
-################################################################################
-def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
-    这里使用扩展之后的bbox
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
-         box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
-         this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
-         box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
-        left_boxes = left_boxes
-    else:
-        left_boxes = []
-    return left_boxes
-def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
-    这里使用扩展之后的bbox
-    """
-    top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
-        box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
-         this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
-        box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(top_boxes) > 0:
-        top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
-        top_boxes = top_boxes
-    else:
-        top_boxes = []
-    return top_boxes
-def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_X] is not None:
-        return this_bbox[IDX_X]
-    else:
-        left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
-        if len(left_nearest_bbox) == 0:
-            this_bbox[IDX_X] = 0
-        else:
-            left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox]
-            this_bbox[IDX_X] = mymax(left_idx_x) + 1
-        return this_bbox[IDX_X]
-def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
-    """
-    寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
-    这个遮挡深度不考虑延长线，而是被实际的长或者宽遮挡的情况
-    """
-    if this_bbox[IDX_Y] is not None:
-        return this_bbox[IDX_Y]
-    else:
-        top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
-        if len(top_nearest_bbox) == 0:
-            this_bbox[IDX_Y] = 0
-        else:
-            top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
-            this_bbox[IDX_Y] = mymax(top_idx_y) + 1
-        return this_bbox[IDX_Y]
-def _paper_bbox_sort_ext(all_bboxes: list):
-    all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
-    all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
-    all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx]  # 变换成一个点，保证能够先X，X相同时按Y排序
-    all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
-    all_bboxes_idx.sort(key=lambda x: x[0])
-    sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
-    return sorted_bboxes
-# ===============================================================================================
-def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox左边的所有bbox, 使用延长线
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
-    if len(left_boxes):
-        left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    return left_boxes
-def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
-    """
-    寻找this_bbox右边的所有bbox, 使用延长线
-    """
-    right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
-    if len(right_boxes):
-        right_boxes.sort(key=lambda x: x[X0_IDX])
-        right_boxes = right_boxes[0]
-    else:
-        right_boxes = None
-    return right_boxes
-# =============================================================================================
-def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox， 不用延长线并且不能像
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个——x1最大的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    return left_boxes
-def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
-        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
-    if len(right_bboxes)>0:
-        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
-        right_bboxes = right_bboxes[0]
-    else:
-        right_bboxes = None
-    return right_bboxes
-def reset_idx_x_y(all_boxes:list)->list:
-    for box in all_boxes:
-        box[IDX_X] = None
-        box[IDX_Y] = None
-    return all_boxes
-# ===================================================================================================
-def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
-    """
-    找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到上方距离this_bbox最近的那个
-    if len(top_bboxes) > 0:
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
-    """
-    找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-         this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个
-    if len(bottom_bboxes) > 0:
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-def find_boundry_bboxes(bboxes:list) -> tuple:
-    """
-    找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1)
-    """
-    x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
-    for box in bboxes:
-        x0 = min(box[X0_IDX], x0)
-        y0 = min(box[Y0_IDX], y0)
-        x1 = max(box[X1_IDX], x1)
-        y1 = max(box[Y1_IDX], y1)
-    return x0, y0, x1, y1
-def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
-    """
-    在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox
-    """
-    for box in bboxes:
-        top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
-        bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
-        if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列
-            box[X0_EXT_IDX] = box[X0_IDX]
-            box[Y0_EXT_IDX] = boundry_y0
-            box[X1_EXT_IDX] = box[X1_IDX]
-            box[Y1_EXT_IDX] = boundry_y1
-        # else:
-        #     if top_nearest_bbox is None:
-        #         box[Y0_EXT_IDX] = boundry_y0
-        #     else:
-        #         box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
-        #     if bottom_nearest_bbox is None:
-        #         box[Y1_EXT_IDX] = boundry_y1
-        #     else:
-        #         box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
-        #     box[X0_EXT_IDX] = box[X0_IDX]
-        #     box[X1_EXT_IDX] = box[X1_IDX]
-    return bboxes
-# ===================================================================================================
-def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
-    """
-    增加预处理行为的排序:
-    return:
-    [
-        {
-            "layout_bbox": [x0, y0, x1, y1],
-            "layout_label":"GOOD_LAYOUT/BAD_LAYOUT",
-            "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序
-        }
-    ]
-    """
-    sorted_layouts = [] # 最后的返回结果
-    page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
-    all_bboxes = paper_bbox_sort(all_bboxes) # 大致拍下序
-    # 首先在水平方向上扩展独占一行的bbox
-    for bbox in all_bboxes:
-        left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线
-        right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
-        if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
-            bbox[X0_EXT_IDX] = page_x0
-            bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
-            bbox[X1_EXT_IDX] = page_x1
-            bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
-    # 此时独占一行的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    if len(all_bboxes)==1:
-        return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
-    if len(all_bboxes)==0:
-        return []
-    """
-    然后合并所有连续水平方向的bbox.
-    """
-    all_bboxes.sort(key=lambda x: x[Y0_IDX])
-    h_bboxes = []
-    h_bbox_group = []
-    v_boxes = []
-    for bbox in all_bboxes:
-        if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1:
-            h_bbox_group.append(bbox)
-        else:
-            if len(h_bbox_group)>0:
-                h_bboxes.append(h_bbox_group) 
-                h_bbox_group = []
-    # 最后一个group
-    if len(h_bbox_group)>0:
-        h_bboxes.append(h_bbox_group)
-    """
-    现在h_bboxes里面是所有的group了，每个group都是一个list
-    对h_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    for gp in h_bboxes:
-        gp.sort(key=lambda x: x[Y0_IDX])
-        block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
-        block_info["layout_bbox"] = [x0, y0, x1, y1]
-        sorted_layouts.append(block_info)
-    # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1，从水平上切分开其余的为几个部分
-    h_split_lines = [page_y0]
-    for gp in h_bboxes:
-        layout_bbox = gp['layout_bbox']
-        y0, y1 = layout_bbox[1], layout_bbox[3]
-        h_split_lines.append(y0)
-        h_split_lines.append(y1)
-    h_split_lines.append(page_y1)
-    unsplited_bboxes = []
-    for i in range(0, len(h_split_lines), 2):
-        start_y0, start_y1 = h_split_lines[i:i+2]
-        # 然后找出[start_y0, start_y1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
-        unsplited_bboxes.append(bboxes_in_block)
-    # ================== 至此，水平方向的 已经切分排序完毕====================================
-    """
-    接下来针对每个非水平的部分切分垂直方向的
-    此时，只剩下了无法被完全水平打通的bbox了。对这些box，优先进行垂直扩展，然后进行垂直切分.
-    分3步：
-    1. 先把能完全垂直打通的隔离出去当做一个layout
-    2. 其余的先垂直切分
-    3. 垂直切分之后的部分再尝试水平切分
-    4. 剩下的不能被切分的各个部分当成一个layout
-    """
-    # 对每部分进行垂直切分
-    for bboxes_in_block in unsplited_bboxes:
-        # 首先对这个block的bbox进行垂直方向上的扩展
-        boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block) 
-        # 进行垂直方向上的扩展
-        extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
-        # 然后对这个block进行垂直方向上的切分
-        extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大，代表了从左到右读取
-        v_boxes_group = []
-        for bbox in extended_vertical_bboxes:
-            if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1:
-                v_boxes_group.append(bbox)
-            else:
-                if len(v_boxes_group)>0:
-                    v_boxes.append(v_boxes_group)
-                    v_boxes_group = []
-        if len(v_boxes_group)>0:
-            v_boxes.append(v_boxes_group)
-        # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了，因为上面已经做了
-        for gp in v_boxes:
-            gp.sort(key=lambda x: x[X0_IDX])
-            block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
-            # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-            x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
-            block_info["layout_bbox"] = [x0, y0, x1, y1]
-            sorted_layouts.append(block_info)
-        # 在垂直方向上，划分子块，也就是用贯通的垂直线进行切分。这些被切分出来的块，极大可能是可被垂直切分的，如果不能完全的垂直切分，那么尝试水平切分。都不能的则当成一个layout
-        v_split_lines = [boundry_x0]
-        for gp in v_boxes:
-            layout_bbox = gp['layout_bbox']
-            x0, x1 = layout_bbox[0], layout_bbox[2]
-            v_split_lines.append(x0)
-            v_split_lines.append(x1)
-        v_split_lines.append(boundry_x1)
-    reset_idx_x_y(all_bboxes)
-    all_boxes = _paper_bbox_sort_ext(all_bboxes)
-    return all_boxes
--- a/magic_pdf/layout/layout_det_utils.py
+++ b/magic_pdf/layout/layout_det_utils.py
-from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
-from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
-def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    在all_bboxes里找到所有右侧垂直方向上和this_bbox有重叠的bbox， 不用延长线
-    并且要考虑两个box左右相交的情况，如果相交了，那么右侧的box就不算最左侧。
-    """
-    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] 
-         and any([
-         box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-         this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-         box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])]
-    # 然后再过滤一下，找到水平上距离this_bbox最近的那个——x1最大的那个
-    if len(left_boxes) > 0:
-        left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
-        left_boxes = left_boxes[0]
-    else:
-        left_boxes = None
-    return left_boxes
-def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] 
-        and any([
-        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
-        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
-        box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])]
-    if len(right_bboxes)>0:
-        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
-        right_bboxes = right_bboxes[0]
-    else:
-        right_bboxes = None
-    return right_bboxes
-def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(top_bboxes)>0:
-        top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(bottom_bboxes)>0:
-        bottom_bboxes.sort(key=lambda x:  x[Y0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-# ===================================================================================================================
-def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(bottom_bboxes)>0:
-        # y0最小， X1最大的那个,也就是box上边缘最靠近this_bbox的那个,并且还最靠右
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
-        # 然后再y1相同的情况下，找到x1最大的那个
-        bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(bottom_bboxes)>0:
-        # y0最小， X0最小的那个
-        bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
-        bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]]
-        # 然后再y0相同的情况下，找到x0最小的那个
-        bottom_bboxes.sort(key=lambda x: x[X0_IDX])
-        bottom_bboxes = bottom_bboxes[0]
-    else:
-        bottom_bboxes = None
-    return bottom_bboxes
-def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(top_bboxes)>0:
-        # y1最大， X0最小的那个
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
-        # 然后再y1相同的情况下，找到x0最小的那个
-        top_bboxes.sort(key=lambda x: x[X0_IDX])
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
-    """
-    找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
-    """
-    top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
-        box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
-        this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
-        box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
-    if len(top_bboxes)>0:
-        # y1最大， X1最大的那个
-        top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
-        top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]]
-        # 然后再y1相同的情况下，找到x1最大的那个
-        top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True)
-        top_bboxes = top_bboxes[0]
-    else:
-        top_bboxes = None
-    return top_bboxes
-# ===================================================================================================================
-def get_left_edge_bboxes(all_bboxes) -> list:
-    """
-    返回最左边的bbox
-    """
-    left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None]
-    return left_bboxes
-def get_right_edge_bboxes(all_bboxes) -> list:
-    """
-    返回最右边的bbox
-    """
-    right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None]
-    return right_bboxes
-def fix_vertical_bbox_pos(bboxes:list):
-    """
-    检查这批bbox在垂直方向是否有轻微的重叠，如果重叠了，就把重叠的bbox往下移动一点
-    在x方向上必须一个包含或者被包含，或者完全重叠，不能只有部分重叠
-    """
-    bboxes.sort(key=lambda x: x[Y0_IDX]) # 从上向下排列
-    for i in range(0, len(bboxes)):
-        for j in range(i+1, len(bboxes)):
-            if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
-                # 如果两个bbox有部分重叠，那么就把下面的bbox往下移动一点
-                bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2 # 2是个经验值
-                break
-    return bboxes
--- a/magic_pdf/layout/layout_sort.py
+++ b/magic_pdf/layout/layout_sort.py
-"""对pdf上的box进行layout识别，并对内部组成的box进行排序."""
-from loguru import logger
-from magic_pdf.layout.bbox_sort import (CONTENT_IDX, CONTENT_TYPE_IDX,
-                                        X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX,
-                                        Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX,
-                                        paper_bbox_sort)
-from magic_pdf.layout.layout_det_utils import (
-    find_all_bottom_bbox_direct, find_all_left_bbox_direct,
-    find_all_right_bbox_direct, find_all_top_bbox_direct,
-    find_bottom_bbox_direct_from_left_edge,
-    find_bottom_bbox_direct_from_right_edge,
-    find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge,
-    get_left_edge_bboxes, get_right_edge_bboxes)
-from magic_pdf.libs.boxbase import get_bbox_in_boundary
-LAYOUT_V = 'V'
-LAYOUT_H = 'H'
-LAYOUT_UNPROC = 'U'
-LAYOUT_BAD = 'B'
-def _is_single_line_text(bbox):
-    """检查bbox里面的文字是否只有一行."""
-    return True  # TODO
-    box_type = bbox[CONTENT_TYPE_IDX]
-    if box_type != 'text':
-        return False
-    paras = bbox[CONTENT_IDX]['paras']
-    text_content = ''
-    for para_id, para in paras.items():  # 拼装内部的段落文本
-        is_title = para['is_title']
-        if is_title != 0:
-            text_content += f"## {para['text']}"
-        else:
-            text_content += para['text']
-        text_content += '\n\n'
-    return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split('\n\n')) <= 1
-def _horizontal_split(bboxes: list, boundary: tuple, avg_font_size=20) -> list:
-    """
-    对bboxes进行水平切割
-    方法是：找到左侧和右侧都没有被直接遮挡的box，然后进行扩展，之后进行切割
-    return:
-        返回几个大的Layout区域 [[x0, y0, x1, y1, "h|u|v"], ], h代表水平，u代表未探测的，v代表垂直布局
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
-    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
-    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
-    """
-    首先在水平方向上扩展独占一行的bbox
-    """
-    last_h_split_line_y1 = bound_y0  # 记录下上次的水平分割线
-    for i, bbox in enumerate(all_bboxes):
-        left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes)  # 非扩展线
-        right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes)
-        if left_nearest_bbox is None and right_nearest_bbox is None:  # 独占一行
-            """
-            然而，如果只是孤立的一行文字，那么就还要满足以下几个条件才可以：
-            1. bbox和中心线相交。或者
-            2. 上方或者下方也存在同类水平的独占一行的bbox。 或者
-            3. TODO 加强条件：这个bbox上方和下方是同一列column，那么就不能算作独占一行
-            """
-            # 先检查这个bbox里是否只包含一行文字
-            # is_single_line = _is_single_line_text(bbox)
-            """
-            这里有个点需要注意，当页面内容不是居中的时候，第一次调用传递的是page的boundary，这个时候mid_x就不是中心线了.
-            所以这里计算出最紧致的boundary，然后再计算mid_x
-            """
-            boundary_real_x0, boundary_real_x1 = min(
-                [bbox[X0_IDX] for bbox in all_bboxes]
-            ), max([bbox[X1_IDX] for bbox in all_bboxes])
-            mid_x = (boundary_real_x0 + boundary_real_x1) / 2
-            # 检查这个box是否内容在中心线有交
-            # 必须跨过去2个字符的宽度
-            is_cross_boundary_mid_line = (
-                min(mid_x - bbox[X0_IDX], bbox[X1_IDX] - mid_x) > avg_font_size * 2
-            )
-            """
-            检查条件2
-            """
-            is_belong_to_col = False
-            """
-            检查是否能被上方col吸收，方法是：
-            1. 上方非空且不是独占一行的，并且
-            2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1]
-            """
-            """
-            以迭代的方式向上找，查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
-            """
-            # 先确定上方的y0, y0
-            b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX]
-            # 然后从box开始逐个向上找到所有与box在x上有交集的box
-            box_to_check = [bound_x0, b_y0, bound_x1, b_y1]
-            bbox_in_bound_check = get_bbox_in_boundary(all_bboxes, box_to_check)
-            bboxes_on_top = []
-            virtual_box = bbox
-            while True:
-                b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check)
-                if b_on_top is not None:
-                    bboxes_on_top.append(b_on_top)
-                    virtual_box = [
-                        min([virtual_box[X0_IDX], b_on_top[X0_IDX]]),
-                        min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]),
-                        max([virtual_box[X1_IDX], b_on_top[X1_IDX]]),
-                        b_y1,
-                    ]
-                else:
-                    break
-            # 随后确定这些box的最小x0, 最大x1
-            if len(bboxes_on_top) > 0 and len(bboxes_on_top) != len(
-                bbox_in_bound_check
-            ):  # virtual_box可能会膨胀到占满整个区域，这实际上就不能属于一个col了。
-                min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
-                # 然后采用一种比较粗糙的方法，看min_x0，max_x1是否与位于[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]之间的box有相交
-                if not any(
-                    [
-                        b[X0_IDX] <= min_x0 - 1 <= b[X1_IDX]
-                        or b[X0_IDX] <= max_x1 + 1 <= b[X1_IDX]
-                        for b in bbox_in_bound_check
-                    ]
-                ):
-                    # 其上，下都不能被扩展成行，暂时只检查一下上方 TODO
-                    top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
-                    bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
-                    if not any(
-                        [
-                            top_nearest_bbox is not None
-                            and (
-                                find_all_left_bbox_direct(top_nearest_bbox, bboxes)
-                                is None
-                                and find_all_right_bbox_direct(top_nearest_bbox, bboxes)
-                                is None
-                            ),
-                            bottom_nearest_bbox is not None
-                            and (
-                                find_all_left_bbox_direct(bottom_nearest_bbox, bboxes)
-                                is None
-                                and find_all_right_bbox_direct(
-                                    bottom_nearest_bbox, bboxes
-                                )
-                                is None
-                            ),
-                            top_nearest_bbox is None or bottom_nearest_bbox is None,
-                        ]
-                    ):
-                        is_belong_to_col = True
-            # 检查是否能被下方col吸收 TODO
-            """
-            这里为什么没有is_cross_boundary_mid_line的条件呢？
-            确实有些杂志左右两栏宽度不是对称的。
-            """
-            if not is_belong_to_col or is_cross_boundary_mid_line:
-                bbox[X0_EXT_IDX] = bound_x0
-                bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
-                bbox[X1_EXT_IDX] = bound_x1
-                bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
-                last_h_split_line_y1 = bbox[Y1_IDX]  # 更新这条线
-            else:
-                continue
-    """
-    此时独占一行的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    然后合并所有连续水平方向的bbox.
-    """
-    all_bboxes.sort(key=lambda x: x[Y0_IDX])
-    h_bboxes = []
-    h_bbox_group = []
-    for bbox in all_bboxes:
-        if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
-            h_bbox_group.append(bbox)
-        else:
-            if len(h_bbox_group) > 0:
-                h_bboxes.append(h_bbox_group)
-                h_bbox_group = []
-    # 最后一个group
-    if len(h_bbox_group) > 0:
-        h_bboxes.append(h_bbox_group)
-    """
-    现在h_bboxes里面是所有的group了，每个group都是一个list
-    对h_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    h_layouts = []
-    for gp in h_bboxes:
-        gp.sort(key=lambda x: x[Y0_IDX])
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = (
-            gp[0][X0_EXT_IDX],
-            gp[0][Y0_EXT_IDX],
-            gp[-1][X1_EXT_IDX],
-            gp[-1][Y1_EXT_IDX],
-        )
-        h_layouts.append([x0, y0, x1, y1, LAYOUT_H])  # 水平的布局
-    """
-    接下来利用这些连续的水平bbox的layout_bbox的y0, y1，从水平上切分开其余的为几个部分
-    """
-    h_split_lines = [bound_y0]
-    for gp in h_bboxes:  # gp是一个list[bbox_list]
-        y0, y1 = gp[0][1], gp[-1][3]
-        h_split_lines.append(y0)
-        h_split_lines.append(y1)
-    h_split_lines.append(bound_y1)
-    unsplited_bboxes = []
-    for i in range(0, len(h_split_lines), 2):
-        start_y0, start_y1 = h_split_lines[i : i + 2]
-        # 然后找出[start_y0, start_y1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [
-            bbox
-            for bbox in all_bboxes
-            if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1
-        ]
-        unsplited_bboxes.append(bboxes_in_block)
-    # 接着把未处理的加入到h_layouts里
-    for bboxes_in_block in unsplited_bboxes:
-        if len(bboxes_in_block) == 0:
-            continue
-        x0, y0, x1, y1 = (
-            bound_x0,
-            min([bbox[Y0_IDX] for bbox in bboxes_in_block]),
-            bound_x1,
-            max([bbox[Y1_IDX] for bbox in bboxes_in_block]),
-        )
-        h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])
-    h_layouts.sort(key=lambda x: x[1])  # 按照y0排序, 也就是从上到下的顺序
-    """
-    转换成如下格式返回
-    """
-    for layout in h_layouts:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': layout[:4],
-                'layout_label': layout[4],
-                'sub_layout': [],
-            }
-        )
-    return sorted_layout_blocks
-###############################################################################################
-#
-#  垂直方向的处理
-#
-#
-###############################################################################################
-def _vertical_align_split_v1(bboxes: list, boundary: tuple) -> list:
-    """
-    计算垂直方向上的对齐， 并分割bboxes成layout。负责对一列多行的进行列维度分割。
-    如果不能完全分割，剩余部分作为layout_lable为u的layout返回
-    -----------------------
-    |     |           |
-    |     |           |
-    |     |           |
-    |     |           |
-    -------------------------
-    此函数会将：以上布局将会切分出来2列
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
-    v_blocks = []
-    """
-    先从左到右切分
-    """
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
-        if len(left_edge_bboxes) == 0:
-            break
-        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes]) + 1
-        # 然后检查这条线能不与其他bbox的左边界相交或者重合
-        if any(
-            [bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]
-        ):
-            # 垂直切分线与某些box发生相交，说明无法完全垂直方向切分。
-            break
-        else:  # 说明成功分割出一列
-            # 找到左侧边界最靠左的bbox作为layout的x0
-            layout_x0 = min(
-                [bbox[X0_IDX] for bbox in left_edge_bboxes]
-            )  # 这里主要是为了画出来有一定间距
-            v_blocks.append(
-                [
-                    layout_x0,
-                    new_boundary[1],
-                    right_split_line_x1,
-                    new_boundary[3],
-                    LAYOUT_V,
-                ]
-            )
-            new_boundary[0] = right_split_line_x1  # 更新边界
-    """
-    再从右到左切， 此时如果还是无法完全切分，那么剩余部分作为layout_lable为u的layout返回
-    """
-    unsplited_block = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
-        if len(right_edge_bboxes) == 0:
-            break
-        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes]) - 1
-        # 然后检查这条线能不与其他bbox的左边界相交或者重合
-        if any(
-            [bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]
-        ):
-            # 这里是余下的
-            unsplited_block.append(
-                [
-                    new_boundary[0],
-                    new_boundary[1],
-                    new_boundary[2],
-                    new_boundary[3],
-                    LAYOUT_UNPROC,
-                ]
-            )
-            break
-        else:
-            # 找到右侧边界最靠右的bbox作为layout的x1
-            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
-            v_blocks.append(
-                [
-                    left_split_line_x0,
-                    new_boundary[1],
-                    layout_x1,
-                    new_boundary[3],
-                    LAYOUT_V,
-                ]
-            )
-            new_boundary[2] = left_split_line_x0  # 更新右边界
-    """
-    最后拼装成layout格式返回
-    """
-    for block in v_blocks:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-    for block in unsplited_block:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-    # 按照x0排序
-    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
-    return sorted_layout_blocks
-def _vertical_align_split_v2(bboxes: list, boundary: tuple) -> list:
-    """改进的
-    _vertical_align_split算法，原算法会因为第二列的box由于左侧没有遮挡被认为是左侧的一部分，导致整个layout多列被识别为一列。
-    利用从左上角的box开始向下看的方法，不断扩展w_x0, w_x1，直到不能继续向下扩展，或者到达边界下边界。"""
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
-    bad_boxes = []  # 被割中的box
-    v_blocks = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        if len(all_bboxes) == 0:
-            break
-        left_top_box = min(
-            all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX])
-        )  # 这里应该加强，检查一下必须是在第一列的 TODO
-        start_box = [
-            left_top_box[X0_IDX],
-            left_top_box[Y0_IDX],
-            left_top_box[X1_IDX],
-            left_top_box[Y1_IDX],
-        ]
-        w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
-        """
-        然后沿着这个box线向下找最近的那个box, 然后扩展w_x0, w_x1
-        扩展之后，宽度会增加，随后用x=w_x1来检测在边界内是否有box与相交，如果相交，那么就说明不能再扩展了。
-        当不能扩展的时候就要看是否到达下边界：
-        1. 达到，那么更新左边界继续分下一个列
-        2. 没有达到，那么此时开始从右侧切分进入下面的循环里
-        """
-        while left_top_box is not None:  # 向下去找
-            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
-            left_top_box = find_bottom_bbox_direct_from_left_edge(
-                virtual_box, all_bboxes
-            )
-            if left_top_box:
-                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
-                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
-                )
-        # 万一这个初始的box在column中间，那么还要向上看
-        start_box = [
-            w_x0,
-            start_box[Y0_IDX],
-            w_x1,
-            start_box[Y1_IDX],
-        ]  # 扩展一下宽度更鲁棒
-        left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
-        while left_top_box is not None:  # 向上去找
-            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
-            left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
-            if left_top_box:
-                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
-                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
-                )
-        # 检查相交
-        if any([bbox[X0_IDX] <= w_x1 + 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
-            for b in all_bboxes:
-                if b[X0_IDX] <= w_x1 + 1 <= b[X1_IDX]:
-                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
-            break
-        else:  # 说明成功分割出一列
-            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
-            new_boundary[0] = w_x1  # 更新边界
-    """
-    接着开始从右上角的box扫描
-    """
-    w_x0, w_x1 = 0, 0
-    unsplited_block = []
-    while True:
-        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
-        if len(all_bboxes) == 0:
-            break
-        # 先找到X1最大的
-        bbox_list_sorted = sorted(
-            all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True
-        )
-        # Then, find the boxes with the smallest Y0 value
-        bigest_x1 = bbox_list_sorted[0][X1_IDX]
-        boxes_with_bigest_x1 = [
-            bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1
-        ]  # 也就是最靠右的那些
-        right_top_box = min(
-            boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]
-        )  # y0最小的那个
-        start_box = [
-            right_top_box[X0_IDX],
-            right_top_box[Y0_IDX],
-            right_top_box[X1_IDX],
-            right_top_box[Y1_IDX],
-        ]
-        w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]
-        while right_top_box is not None:
-            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
-            right_top_box = find_bottom_bbox_direct_from_right_edge(
-                virtual_box, all_bboxes
-            )
-            if right_top_box:
-                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
-                    [w_x1, right_top_box[X1_IDX]]
-                )
-        # 在向上扫描
-        start_box = [
-            w_x0,
-            start_box[Y0_IDX],
-            w_x1,
-            start_box[Y1_IDX],
-        ]  # 扩展一下宽度更鲁棒
-        right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
-        while right_top_box is not None:
-            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
-            right_top_box = find_top_bbox_direct_from_right_edge(
-                virtual_box, all_bboxes
-            )
-            if right_top_box:
-                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
-                    [w_x1, right_top_box[X1_IDX]]
-                )
-        # 检查是否与其他box相交， 垂直切分线与某些box发生相交，说明无法完全垂直方向切分。
-        if any([bbox[X0_IDX] <= w_x0 - 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
-            unsplited_block.append(
-                [
-                    new_boundary[0],
-                    new_boundary[1],
-                    new_boundary[2],
-                    new_boundary[3],
-                    LAYOUT_UNPROC,
-                ]
-            )
-            for b in all_bboxes:
-                if b[X0_IDX] <= w_x0 - 1 <= b[X1_IDX]:
-                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
-            break
-        else:  # 说明成功分割出一列
-            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
-            new_boundary[2] = w_x0
-    """转换数据结构"""
-    for block in v_blocks:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-            }
-        )
-    for block in unsplited_block:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': block[:4],
-                'layout_label': block[4],
-                'sub_layout': [],
-                'bad_boxes': bad_boxes,  # 记录下来，这个box是被割中的
-            }
-        )
-    # 按照x0排序
-    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
-    return sorted_layout_blocks
-def _try_horizontal_mult_column_split(bboxes: list, boundary: tuple) -> list:
-    """
-    尝试水平切分，如果切分不动，那就当一个BAD_LAYOUT返回
-    ------------------
-    |        |       |
-    ------------------
-    |    |       |   |   <-  这里是此函数要切分的场景
-    ------------------
-    |        |       |
-    |        |       |
-    """
-    pass
-def _vertical_split(bboxes: list, boundary: tuple) -> list:
-    """
-    从垂直方向进行切割，分block
-    这个版本里，如果垂直切分不动，那就当一个BAD_LAYOUT返回
-                                --------------------------
-                                    |        |       |
-                                    |        |       |
-                                | |
-    这种列是此函数要切分的  ->    | |
-                                | |
-                                    |        |       |
-                                    |        |       |
-                                -------------------------
-    """
-    sorted_layout_blocks = []  # 这是要最终返回的值
-    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
-    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
-    """
-    all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖
-    all_bboxes = fix_hor_bbox_pos(all_bboxes)  # 水平解覆盖
-    这两行代码目前先不执行，因为公式检测，表格检测还不是很成熟，导致非常多的textblock参与了运算，时间消耗太大。
-    这两行代码的作用是：
-    如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩，从而避免重叠。对布局切分来说带来正反馈。
-    """
-    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。
-    """
-    首先在垂直方向上扩展独占一行的bbox
-    """
-    for bbox in all_bboxes:
-        top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes)  # 非扩展线
-        bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
-        if (
-            top_nearest_bbox is None
-            and bottom_nearest_bbox is None
-            and not any(
-                [
-                    b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX]
-                    or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX]
-                    for b in all_bboxes
-                ]
-            )
-        ):  # 独占一列, 且不和其他重叠
-            bbox[X0_EXT_IDX] = bbox[X0_IDX]
-            bbox[Y0_EXT_IDX] = bound_y0
-            bbox[X1_EXT_IDX] = bbox[X1_IDX]
-            bbox[Y1_EXT_IDX] = bound_y1
-        """
-    此时独占一列的被成功扩展到指定的边界上，这个时候利用边界条件合并连续的bbox，成为一个group
-    然后合并所有连续垂直方向的bbox.
-    """
-    all_bboxes.sort(key=lambda x: x[X0_IDX])
-    # fix: 这里水平方向的列不要合并成一个行，因为需要保证返回给下游的最小block，总是可以无脑从上到下阅读文字。
-    v_bboxes = []
-    for box in all_bboxes:
-        if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1:
-            v_bboxes.append(box)
-    """
-    现在v_bboxes里面是所有的group了，每个group都是一个list
-    对v_bboxes里的每个group进行计算放回到sorted_layouts里
-    """
-    v_layouts = []
-    for vbox in v_bboxes:
-        # gp.sort(key=lambda x: x[X0_IDX])
-        # 然后计算这个group的layout_bbox，也就是最小的x0,y0, 最大的x1,y1
-        x0, y0, x1, y1 = (
-            vbox[X0_EXT_IDX],
-            vbox[Y0_EXT_IDX],
-            vbox[X1_EXT_IDX],
-            vbox[Y1_EXT_IDX],
-        )
-        v_layouts.append([x0, y0, x1, y1, LAYOUT_V])  # 垂直的布局
-    """
-    接下来利用这些连续的垂直bbox的layout_bbox的x0, x1，从垂直上切分开其余的为几个部分
-    """
-    v_split_lines = [bound_x0]
-    for gp in v_bboxes:
-        x0, x1 = gp[X0_IDX], gp[X1_IDX]
-        v_split_lines.append(x0)
-        v_split_lines.append(x1)
-    v_split_lines.append(bound_x1)
-    unsplited_bboxes = []
-    for i in range(0, len(v_split_lines), 2):
-        start_x0, start_x1 = v_split_lines[i : i + 2]
-        # 然后找出[start_x0, start_x1]之间的其他bbox，这些组成一个未分割板块
-        bboxes_in_block = [
-            bbox
-            for bbox in all_bboxes
-            if bbox[X0_IDX] >= start_x0 and bbox[X1_IDX] <= start_x1
-        ]
-        unsplited_bboxes.append(bboxes_in_block)
-    # 接着把未处理的加入到v_layouts里
-    for bboxes_in_block in unsplited_bboxes:
-        if len(bboxes_in_block) == 0:
-            continue
-        x0, y0, x1, y1 = (
-            min([bbox[X0_IDX] for bbox in bboxes_in_block]),
-            bound_y0,
-            max([bbox[X1_IDX] for bbox in bboxes_in_block]),
-            bound_y1,
-        )
-        v_layouts.append(
-            [x0, y0, x1, y1, LAYOUT_UNPROC]
-        )  # 说明这篇区域未能够分析出可靠的版面
-    v_layouts.sort(key=lambda x: x[0])  # 按照x0排序, 也就是从左到右的顺序
-    for layout in v_layouts:
-        sorted_layout_blocks.append(
-            {
-                'layout_bbox': layout[:4],
-                'layout_label': layout[4],
-                'sub_layout': [],
-            }
-        )
-    """
-    至此，垂直方向切成了2种类型，其一是独占一列的，其二是未处理的。
-    下面对这些未处理的进行垂直方向切分，这个切分要切出来类似“吕”这种类型的垂直方向的布局
-    """
-    for i, layout in enumerate(sorted_layout_blocks):
-        if layout['layout_label'] == LAYOUT_UNPROC:
-            x0, y0, x1, y1 = layout['layout_bbox']
-            v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1])
-            sorted_layout_blocks[i] = {
-                'layout_bbox': [x0, y0, x1, y1],
-                'layout_label': LAYOUT_H,
-                'sub_layout': v_split_layouts,
-            }
-            layout['layout_label'] = LAYOUT_H  # 被垂线切分成了水平布局
-    return sorted_layout_blocks
-def split_layout(bboxes: list, boundary: tuple, page_num: int) -> list:
-    """
-    把bboxes切割成layout
-    return:
-    [
-        {
-            "layout_bbox": [x0,y0,x1,y1],
-            "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
-            "sub_layout":[] #每个元素都是[
-                                            x0,y0,
-                                            x1,y1,
-                                            block_content,
-                                            idx_x,idx_y,
-                                            content_type,
-                                            ext_x0,ext_y0,
-                                            ext_x1,ext_y1
-                                        ], 并且顺序就是阅读顺序
-        }
-    ]
-    example:
-    [
-        {
-            "layout_bbox": [0, 0, 100, 100],
-            "layout_label":"u|v|h|b",
-            "sub_layout":[
-            ]
-        },
-        {
-            "layout_bbox": [0, 0, 100, 100],
-            "layout_label":"u|v|h|b",
-            "sub_layout":[
-                {
-                    "layout_bbox": [0, 0, 100, 100],
-                    "layout_label":"u|v|h|b",
-                    "content_bboxes":[
-                        [],
-                        [],
-                        []
-                    ]
-                },
-                {
-                    "layout_bbox": [0, 0, 100, 100],
-                    "layout_label":"u|v|h|b",
-                    "sub_layout":[
-                    ]
-                }
-        }
-    ]
-    """
-    sorted_layouts = []  # 最终返回的结果
-    boundary_x0, boundary_y0, boundary_x1, boundary_y1 = boundary
-    if len(bboxes) <= 1:
-        return [
-            {
-                'layout_bbox': [boundary_x0, boundary_y0, boundary_x1, boundary_y1],
-                'layout_label': LAYOUT_V,
-                'sub_layout': [],
-            }
-        ]
-    """
-    接下来按照先水平后垂直的顺序进行切分
-    """
-    bboxes = paper_bbox_sort(
-        bboxes, boundary_x1 - boundary_x0, boundary_y1 - boundary_y0
-    )
-    sorted_layouts = _horizontal_split(bboxes, boundary)  # 通过水平分割出来的layout
-    for i, layout in enumerate(sorted_layouts):
-        x0, y0, x1, y1 = layout['layout_bbox']
-        layout_type = layout['layout_label']
-        if layout_type == LAYOUT_UNPROC:  # 说明是非独占单行的，这些需要垂直切分
-            v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])
-            """
-            最后这里有个逻辑问题：如果这个函数只分离出来了一个column layout，那么这个layout分割肯定超出了算法能力范围。因为我们假定的是传进来的
-            box已经把行全部剥离了，所以这里必须十多个列才可以。如果只剥离出来一个layout，并且是多个box，那么就说明这个layout是无法分割的，标记为LAYOUT_UNPROC
-            """
-            layout_label = LAYOUT_V
-            if len(v_split_layouts) == 1:
-                if len(v_split_layouts[0]['sub_layout']) == 0:
-                    layout_label = LAYOUT_UNPROC
-                    # logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)
-            """
-            组合起来最终的layout
-            """
-            sorted_layouts[i] = {
-                'layout_bbox': [x0, y0, x1, y1],
-                'layout_label': layout_label,
-                'sub_layout': v_split_layouts,
-            }
-            layout['layout_label'] = LAYOUT_H
-    """
-    水平和垂直方向都切分完毕了。此时还有一些未处理的，这些未处理的可能是因为水平和垂直方向都无法切分。
-    这些最后调用_try_horizontal_mult_block_split做一次水平多个block的联合切分，如果也不能切分最终就当做BAD_LAYOUT返回
-    """
-    # TODO
-    return sorted_layouts
-def get_bboxes_layout(all_boxes: list, boundary: tuple, page_id: int):
-    """
-    对利用layout排序之后的box，进行排序
-    return:
-    [
-        {
-            "layout_bbox": [x0, y0, x1, y1],
-            "layout_label":"u|v|h|b", 未处理|垂直|水平|BAD_LAYOUT
-        }，
-    ]
-    """
-    def _preorder_traversal(layout):
-        """对sorted_layouts的叶子节点，也就是len(sub_layout)==0的节点进行排序。排序按照前序遍历的顺序，也就是从上到
-        下，从左到右的顺序."""
-        sorted_layout_blocks = []
-        for layout in layout:
-            sub_layout = layout['sub_layout']
-            if len(sub_layout) == 0:
-                sorted_layout_blocks.append(layout)
-            else:
-                s = _preorder_traversal(sub_layout)
-                sorted_layout_blocks.extend(s)
-        return sorted_layout_blocks
-    # -------------------------------------------------------------------------------------------------------------------------
-    sorted_layouts = split_layout(
-        all_boxes, boundary, page_id
-    )  # 先切分成layout，得到一个Tree
-    total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
-    return total_sorted_layout_blocks, sorted_layouts
-def get_columns_cnt_of_layout(layout_tree):
-    """获取一个layout的宽度."""
-    max_width_list = [0]  # 初始化一个元素，防止max,min函数报错
-    for items in layout_tree:  # 针对每一层（横切）计算列数，横着的算一列
-        layout_type = items['layout_label']
-        sub_layouts = items['sub_layout']
-        if len(sub_layouts) == 0:
-            max_width_list.append(1)
-        else:
-            if layout_type == LAYOUT_H:
-                max_width_list.append(1)
-            else:
-                width = 0
-                for sub_layout in sub_layouts:
-                    if len(sub_layout['sub_layout']) == 0:
-                        width += 1
-                    else:
-                        for lay in sub_layout['sub_layout']:
-                            width += get_columns_cnt_of_layout([lay])
-                max_width_list.append(width)
-    return max(max_width_list)
-def sort_with_layout(bboxes: list, page_width, page_height) -> (list, list):
-    """输入是一个bbox的list.
-    获取到输入之后，先进行layout切分，然后对这些bbox进行排序。返回排序后的bboxes
-    """
-    new_bboxes = []
-    for box in bboxes:
-        # new_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
-        new_bboxes.append(
-            [
-                box[0],
-                box[1],
-                box[2],
-                box[3],
-                None,
-                None,
-                None,
-                'text',
-                None,
-                None,
-                None,
-                None,
-                box[4],
-            ]
-        )
-    layout_bboxes, _ = get_bboxes_layout(
-        new_bboxes, tuple([0, 0, page_width, page_height]), 0
-    )
-    if any([lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]):
-        logger.warning('drop this pdf, reason: 复杂版面')
-        return None, None
-    sorted_bboxes = []
-    # 利用layout bbox每次框定一些box，然后排序
-    for layout in layout_bboxes:
-        lbox = layout['layout_bbox']
-        bbox_in_layout = get_bbox_in_boundary(new_bboxes, lbox)
-        sorted_bbox = paper_bbox_sort(
-            bbox_in_layout, lbox[2] - lbox[0], lbox[3] - lbox[1]
-        )
-        sorted_bboxes.extend(sorted_bbox)
-    return sorted_bboxes, layout_bboxes
-def sort_text_block(text_block, layout_bboxes):
-    """对一页的text_block进行排序."""
-    sorted_text_bbox = []
-    all_text_bbox = []
-    # 做一个box=>text的映射
-    box_to_text = {}
-    for blk in text_block:
-        box = blk['bbox']
-        box_to_text[(box[0], box[1], box[2], box[3])] = blk
-        all_text_bbox.append(box)
-    # text_blocks_to_sort = []
-    # for box in box_to_text.keys():
-    #     text_blocks_to_sort.append([box[0], box[1], box[2], box[3], None, None, None, 'text', None, None, None, None])
-    # 按照layout_bboxes的顺序，对text_block进行排序
-    for layout in layout_bboxes:
-        layout_box = layout['layout_bbox']
-        text_bbox_in_layout = get_bbox_in_boundary(
-            all_text_bbox,
-            [
-                layout_box[0] - 1,
-                layout_box[1] - 1,
-                layout_box[2] + 1,
-                layout_box[3] + 1,
-            ],
-        )
-        # sorted_bbox = paper_bbox_sort(text_bbox_in_layout, layout_box[2]-layout_box[0], layout_box[3]-layout_box[1])
-        text_bbox_in_layout.sort(
-            key=lambda x: x[1]
-        )  # 一个layout内部的box，按照y0自上而下排序
-        # sorted_bbox = [[b] for b in text_blocks_to_sort]
-        for sb in text_bbox_in_layout:
-            sorted_text_bbox.append(box_to_text[(sb[0], sb[1], sb[2], sb[3])])
-    return sorted_text_bbox
--- a/magic_pdf/layout/layout_spiler_recog.py
+++ b/magic_pdf/layout/layout_spiler_recog.py
-"""
-找到能分割布局的水平的横线、色块
-"""
-import os
-from magic_pdf.libs.commons import fitz
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap
-def __rect_filter_by_width(rect, page_w, page_h):
-    mid_x = page_w/2
-    if rect[0]< mid_x < rect[2]:
-        return True
-    return False
-def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
-    """
-    不能出现在table和image的位置
-    """
-    for box in image_bboxes:
-        if _is_in_or_part_overlap(rect, box):
-            return False
-    for box in table_bboxes:
-        if _is_in_or_part_overlap(rect, box):
-            return False
-    return True
-def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
-    save_path = "./tmp/debug.pdf"
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-    width = page.rect.width
-    height = page.rect.height
-    new_page = doc.new_page(width=width, height=height)
-    shape = new_page.new_shape()
-    for bbox in bboxes1:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes2:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes3:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    doc.save(save_path)
-    doc.close() 
-def get_spilter_of_page(page, image_bboxes, table_bboxes):
-    """
-    获取到色块和横线
-    """
-    cdrawings = page.get_cdrawings()
-    spilter_bbox = []
-    for block in cdrawings:
-        if 'fill' in block:
-            fill = block['fill']
-        if 'fill' in block and block['fill'] and block['fill']!=(1.0,1.0,1.0):
-            rect = block['rect']
-            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
-                spilter_bbox.append(list(rect))
-    """过滤、修正一下这些box。因为有时候会有一些矩形，高度为0或者为负数，造成layout计算无限循环。如果是负高度或者0高度，统一修正为高度为1"""
-    for box in spilter_bbox:
-        if box[3]-box[1] <= 0:
-            box[3] = box[1] + 1
-    #__debug_show_page(page, spilter_bbox, [], [])
-    return spilter_bbox
--- a/magic_pdf/layout/mcol_sort.py
+++ b/magic_pdf/layout/mcol_sort.py
-"""
-This is an advanced PyMuPDF utility for detecting multi-column pages.
-It can be used in a shell script, or its main function can be imported and
-invoked as descript below.
-Features
---------
- Identify text belonging to (a variable number of) columns on the page.
- Text with different background color is handled separately, allowing for
-  easier treatment of side remarks, comment boxes, etc.
- Uses text block detection capability to identify text blocks and
-  uses the block bboxes as primary structuring principle.
- Supports ignoring footers via a footer margin parameter.
- Returns re-created text boundary boxes (integer coordinates), sorted ascending
-  by the top, then by the left coordinates.
-Restrictions
-------------
- Only supporting horizontal, left-to-right text
- Returns a list of text boundary boxes - not the text itself. The caller is
-  expected to extract text from within the returned boxes.
- Text written above images is ignored altogether (option).
- This utility works as expected in most cases. The following situation cannot
-  be handled correctly:
-    * overlapping (non-disjoint) text blocks
-    * image captions are not recognized and are handled like normal text
-Usage
------
- As a CLI shell command use
-  python multi_column.py input.pdf footer_margin
-  Where footer margin is the height of the bottom stripe to ignore on each page.
-  This code is intended to be modified according to your need.
- Use in a Python script as follows:
-  ----------------------------------------------------------------------------------
-  from multi_column import column_boxes
-  # for each page execute
-  bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
-  # bboxes is a list of fitz.IRect objects, that are sort ascending by their y0,
-  # then x0 coordinates. Their text content can be extracted by all PyMuPDF
-  # get_text() variants, like for instance the following:
-  for rect in bboxes:
-      print(page.get_text(clip=rect, sort=True))
-  ----------------------------------------------------------------------------------
-"""
-import sys
-from magic_pdf.libs.commons import fitz
-def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
-    """Determine bboxes which wrap a column."""
-    paths = page.get_drawings()
-    bboxes = []
-    # path rectangles
-    path_rects = []
-    # image bboxes
-    img_bboxes = []
-    # bboxes of non-horizontal text
-    # avoid when expanding horizontal text boxes
-    vert_bboxes = []
-    # compute relevant page area
-    clip = +page.rect
-    clip.y1 -= footer_margin  # Remove footer area
-    clip.y0 += header_margin  # Remove header area
-    def can_extend(temp, bb, bboxlist):
-        """Determines whether rectangle 'temp' can be extended by 'bb'
-        without intersecting any of the rectangles contained in 'bboxlist'.
-        Items of bboxlist may be None if they have been removed.
-        Returns:
-            True if 'temp' has no intersections with items of 'bboxlist'.
-        """
-        for b in bboxlist:
-            if not intersects_bboxes(temp, vert_bboxes) and (
-                b == None or b == bb or (temp & b).is_empty
-            ):
-                continue
-            return False
-        return True
-    def in_bbox(bb, bboxes):
-        """Return 1-based number if a bbox contains bb, else return 0."""
-        for i, bbox in enumerate(bboxes):
-            if bb in bbox:
-                return i + 1
-        return 0
-    def intersects_bboxes(bb, bboxes):
-        """Return True if a bbox intersects bb, else return False."""
-        for bbox in bboxes:
-            if not (bb & bbox).is_empty:
-                return True
-        return False
-    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
-        """Extend a bbox to the right page border.
-        Whenever there is no text to the right of a bbox, enlarge it up
-        to the right page border.
-        Args:
-            bboxes: (list[IRect]) bboxes to check
-            width: (int) page width
-            path_bboxes: (list[IRect]) bboxes with a background color
-            vert_bboxes: (list[IRect]) bboxes with vertical text
-            img_bboxes: (list[IRect]) bboxes of images
-        Returns:
-            Potentially modified bboxes.
-        """
-        for i, bb in enumerate(bboxes):
-            # do not extend text with background color
-            if in_bbox(bb, path_bboxes):
-                continue
-            # do not extend text in images
-            if in_bbox(bb, img_bboxes):
-                continue
-            # temp extends bb to the right page border
-            temp = +bb
-            temp.x1 = width
-            # do not cut through colored background or images
-            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
-                continue
-            # also, do not intersect other text bboxes
-            check = can_extend(temp, bb, bboxes)
-            if check:
-                bboxes[i] = temp  # replace with enlarged bbox
-        return [b for b in bboxes if b != None]
-    def clean_nblocks(nblocks):
-        """Do some elementary cleaning."""
-        # 1. remove any duplicate blocks.
-        blen = len(nblocks)
-        if blen < 2:
-            return nblocks
-        start = blen - 1
-        for i in range(start, -1, -1):
-            bb1 = nblocks[i]
-            bb0 = nblocks[i - 1]
-            if bb0 == bb1:
-                del nblocks[i]
-        # 2. repair sequence in special cases:
-        # consecutive bboxes with almost same bottom value are sorted ascending
-        # by x-coordinate.
-        y1 = nblocks[0].y1  # first bottom coordinate
-        i0 = 0  # its index
-        i1 = -1  # index of last bbox with same bottom
-        # Iterate over bboxes, identifying segments with approx. same bottom value.
-        # Replace every segment by its sorted version.
-        for i in range(1, len(nblocks)):
-            b1 = nblocks[i]
-            if abs(b1.y1 - y1) > 10:  # different bottom
-                if i1 > i0:  # segment length > 1? Sort it!
-                    nblocks[i0 : i1 + 1] = sorted(
-                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
-                    )
-                y1 = b1.y1  # store new bottom value
-                i0 = i  # store its start index
-            i1 = i  # store current index
-        if i1 > i0:  # segment waiting to be sorted
-            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
-        return nblocks
-    # extract vector graphics
-    for p in paths:
-        path_rects.append(p["rect"].irect)
-    path_bboxes = path_rects
-    # sort path bboxes by ascending top, then left coordinates
-    path_bboxes.sort(key=lambda b: (b.y0, b.x0))
-    # bboxes of images on page, no need to sort them
-    for item in page.get_images():
-        img_bboxes.extend(page.get_image_rects(item[0]))
-    # blocks of text on page
-    blocks = page.get_text(
-        "dict",
-        flags=fitz.TEXTFLAGS_TEXT,
-        clip=clip,
-    )["blocks"]
-    # Make block rectangles, ignoring non-horizontal text
-    for b in blocks:
-        bbox = fitz.IRect(b["bbox"])  # bbox of the block
-        # ignore text written upon images
-        if no_image_text and in_bbox(bbox, img_bboxes):
-            continue
-        # confirm first line to be horizontal
-        line0 = b["lines"][0]  # get first line
-        if line0["dir"] != (1, 0):  # only accept horizontal text
-            vert_bboxes.append(bbox)
-            continue
-        srect = fitz.EMPTY_IRECT()
-        for line in b["lines"]:
-            lbbox = fitz.IRect(line["bbox"])
-            text = "".join([s["text"].strip() for s in line["spans"]])
-            if len(text) > 1:
-                srect |= lbbox
-        bbox = +srect
-        if not bbox.is_empty:
-            bboxes.append(bbox)
-    # Sort text bboxes by ascending background, top, then left coordinates
-    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
-    # Extend bboxes to the right where possible
-    bboxes = extend_right(
-        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
-    )
-    # immediately return of no text found
-    if bboxes == []:
-        return []
-    # --------------------------------------------------------------------
-    # Join bboxes to establish some column structure
-    # --------------------------------------------------------------------
-    # the final block bboxes on page
-    nblocks = [bboxes[0]]  # pre-fill with first bbox
-    bboxes = bboxes[1:]  # remaining old bboxes
-    for i, bb in enumerate(bboxes):  # iterate old bboxes
-        check = False  # indicates unwanted joins
-        # check if bb can extend one of the new blocks
-        for j in range(len(nblocks)):
-            nbb = nblocks[j]  # a new block
-            # never join across columns
-            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
-                continue
-            # never join across different background colors
-            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
-                continue
-            temp = bb | nbb  # temporary extension of new block
-            check = can_extend(temp, nbb, nblocks)
-            if check == True:
-                break
-        if not check:  # bb cannot be used to extend any of the new bboxes
-            nblocks.append(bb)  # so add it to the list
-            j = len(nblocks) - 1  # index of it
-            temp = nblocks[j]  # new bbox added
-        # check if some remaining bbox is contained in temp
-        check = can_extend(temp, bb, bboxes)
-        if check == False:
-            nblocks.append(bb)
-        else:
-            nblocks[j] = temp
-        bboxes[i] = None
-    # do some elementary cleaning
-    nblocks = clean_nblocks(nblocks)
-    # return identified text bboxes
-    return nblocks
-if __name__ == "__main__":
-    """Only for debugging purposes, currently.
-    Draw red borders around the returned text bboxes and insert
-    the bbox number.
-    Then save the file under the name "input-blocks.pdf".
-    """
-    # get the file name
-    filename = sys.argv[1]
-    # check if footer margin is given
-    if len(sys.argv) > 2:
-        footer_margin = int(sys.argv[2])
-    else:  # use default vaue
-        footer_margin = 50
-    # check if header margin is given
-    if len(sys.argv) > 3:
-        header_margin = int(sys.argv[3])
-    else:  # use default vaue
-        header_margin = 50
-    # open document
-    doc = fitz.open(filename)
-    # iterate over the pages
-    for page in doc:
-        # remove any geometry issues
-        page.wrap_contents()
-        # get the text bboxes
-        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
-        # prepare a canvas to draw rectangles and text
-        shape = page.new_shape()
-        # iterate over the bboxes
-        for i, rect in enumerate(bboxes):
-            shape.draw_rect(rect)  # draw a border
-            # write sequence number
-            shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
-        # finish drawing / text with color red
-        shape.finish(color=fitz.pdfcolor["red"])
-        shape.commit()  # store to the page
-    # save document with text bboxes
-    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
\ No newline at end of file
--- a/magic_pdf/libs/calc_span_stats.py
+++ b/magic_pdf/libs/calc_span_stats.py
-import os
-import csv
-import json
-import pandas as pd
-from pandas import DataFrame as df
-from matplotlib import pyplot as plt
-from termcolor import cprint
-"""
-Execute this script in the following way:
-1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
-    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
-2. Under the directory code-clean, execute the following command:
-    $ python -m libs.calc_span_stats
-"""
-def print_green_on_red(text):
-    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
-def print_green(text):
-    print()
-    cprint(text, "green", attrs=["bold"], end="\n\n")
-def print_red(text):
-    print()
-    cprint(text, "red", attrs=["bold"], end="\n\n")
-def safe_get(dict_obj, key, default):
-    val = dict_obj.get(key)
-    if val is None:
-        return default
-    else:
-        return val
-class SpanStatsCalc:
-    """Calculate statistics of span."""
-    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
-        """Draw multiple figures in one figure."""
-        # make a canvas
-        fig = plt.figure(fig_num, figsize=(20, 20))
-        pass
-    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
-        """Calculate statistics per pdf_dict."""
-        span_stats = pd.DataFrame()
-        span_stats = []
-        span_id = 0
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    for para_block in blocks["para_blocks"]:
-                        for line in para_block["lines"]:
-                            for span in line["spans"]:
-                                span_text = safe_get(span, "text", "")
-                                span_font_name = safe_get(span, "font", "")
-                                span_font_size = safe_get(span, "size", 0)
-                                span_font_color = safe_get(span, "color", "")
-                                span_font_flags = safe_get(span, "flags", 0)
-                                span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
-                                span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
-                                span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
-                                span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
-                                span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
-                                span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
-                                span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
-                                span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)
-                                span_stats.append(
-                                    {
-                                        "span_id": span_id,  # id of span
-                                        "page_id": page_id,  # page number of pdf
-                                        "span_text": span_text,  # text of span
-                                        "span_font_name": span_font_name,  # font name of span
-                                        "span_font_size": span_font_size,  # font size of span
-                                        "span_font_color": span_font_color,  # font color of span
-                                        "span_font_flags": span_font_flags,  # font flags of span
-                                        "span_is_superscript": int(
-                                            span_is_super_script
-                                        ),  # indicate whether the span is super script or not
-                                        "span_is_italic": int(span_is_italic),  # indicate whether the span is italic or not
-                                        "span_is_serifed": int(span_is_serifed),  # indicate whether the span is serifed or not
-                                        "span_is_sans_serifed": int(
-                                            span_is_sans_serifed
-                                        ),  # indicate whether the span is sans serifed or not
-                                        "span_is_monospaced": int(
-                                            span_is_monospaced
-                                        ),  # indicate whether the span is monospaced or not
-                                        "span_is_proportional": int(
-                                            span_is_proportional
-                                        ),  # indicate whether the span is proportional or not
-                                        "span_is_bold": int(span_is_bold),  # indicate whether the span is bold or not
-                                    }
-                                )
-                                span_id += 1
-        span_stats = pd.DataFrame(span_stats)
-        # print(span_stats)
-        return span_stats
-def __find_pdf_dic_files(
-    jf_name="pdf_dic.json",
-    base_code_name="code-clean",
-    tgt_base_dir_name="tmp",
-    unittest_dir_name="unittest",
-    md_dir_name="md",
-    book_names=[
-        "scihub",
-    ],  # other possible values: "zlib", "arxiv" and so on
-):
-    pdf_dict_files = []
-    curr_dir = os.path.dirname(__file__)
-    for i in range(len(curr_dir)):
-        if curr_dir[i : i + len(base_code_name)] == base_code_name:
-            base_code_dir_name = curr_dir[: i + len(base_code_name)]
-            for book_name in book_names:
-                search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
-                if os.path.exists(base_code_dir_name):
-                    search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
-                    for root, dirs, files in os.walk(search_dir_name):
-                        for file in files:
-                            if file == jf_name:
-                                pdf_dict_files.append(os.path.join(root, file))
-                break
-    return pdf_dict_files
-def combine_span_texts(group_df, span_stats):
-    combined_span_texts = []
-    for _, row in group_df.iterrows():
-        curr_span_id = row.name
-        curr_span_text = row["span_text"]
-        pre_span_id = curr_span_id - 1
-        pre_span_text = span_stats.at[pre_span_id, "span_text"] if pre_span_id in span_stats.index else ""
-        next_span_id = curr_span_id + 1
-        next_span_text = span_stats.at[next_span_id, "span_text"] if next_span_id in span_stats.index else ""
-        # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
-        pointer_sign = "→ → → "
-        combined_text = "\n".join([pointer_sign + pre_span_text, pointer_sign + curr_span_text, pointer_sign + next_span_text])
-        combined_span_texts.append(combined_text)
-    return "\n\n".join(combined_span_texts)
-# pd.set_option("display.max_colwidth", None)  # 设置为 None 来显示完整的文本
-pd.set_option("display.max_rows", None)  # 设置为 None 来显示更多的行
-def main():
-    pdf_dict_files = __find_pdf_dic_files()
-    # print(pdf_dict_files)
-    span_stats_calc = SpanStatsCalc()
-    for pdf_dict_file in pdf_dict_files:
-        print("-" * 100)
-        print_green_on_red(f"Processing {pdf_dict_file}")
-        with open(pdf_dict_file, "r", encoding="utf-8") as f:
-            pdf_dict = json.load(f)
-            raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
-            raw_df.to_csv(save_path, index=False)
-            filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
-            if filtered_df.empty:
-                print("No superscript span found!")
-                continue
-            filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
-            combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
-            final_df = filtered_grouped_df.size().reset_index(name="count")
-            final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
-            print(final_df)
-            final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
-            # 使用 UTF-8 编码并添加 BOM，确保所有字段被双引号包围
-            final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
-            # 创建一个 2x2 的图表布局
-            fig, axs = plt.subplots(2, 2, figsize=(15, 10))
-            # 按照 span_font_name 分类作图
-            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
-            # 按照 span_font_size 分类作图
-            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
-            # 按照 span_font_color 分类作图
-            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
-            # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图
-            grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
-            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
-            # 调整布局
-            plt.tight_layout()
-            # 显示图表
-            # plt.show()
-            # 保存图表到 PNG 文件
-            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
-            plt.savefig(save_path)
-            # 清除画布
-            plt.clf()
-if __name__ == "__main__":
-    main()
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-import fitz # 1.23.9中已经切换到rebase
-# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
 def join_path(*args):
    return '/'.join(str(s).rstrip('/') for s in args)
-#配置全局的errlog_path，方便demo同步引用
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径，应该在业务代码中定义
 def get_top_percent_list(num_list, percent):
    """
    获取列表中前百分之多少的元素
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
    return top_percent_list
-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
 def mymax(alist: list):
    if len(alist) == 0:
        return 0  # 空是0， 0*0也是0大小q
    else:
        return max(alist)
-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # 解析配置文件
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # 获取 AWS 账户相关信息
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-    return ak, sk, endpoint, addressing_style
 def parse_bucket_key(s3_full_path: str):
    """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
        s3_full_path = s3_full_path[1:]
    bucket, key = s3_full_path.split("/", 1)
    return bucket, key
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-def get_docx_model_output(pdf_model_output, page_id):
-    model_output_json = pdf_model_output[page_id]
-    return model_output_json
-def list_dir(dir_path:str, s3_profile:str):
-    """
-    列出dir_path下的所有文件
-    """
-    ret = []
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                                            config=Config(s3={'addressing_style': addressing_style}))
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents)==0:
-                        break
-                    marker = contents[-1]['Key']
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                #size = info['Size']
-                if path!="":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-            return ret
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-def get_img_s3_client(save_path:str, image_s3_config:str):
-    """
-    """
-    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-    return img_s3_client
-if __name__=="__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)
\ No newline at end of file
--- a/magic_pdf/libs/detect_language_from_model.py
+++ b/magic_pdf/libs/detect_language_from_model.py
-from collections import Counter
-from magic_pdf.libs.language import detect_lang
-def get_language_from_model(model_list: list):
-    language_lst = []
-    for ocr_page_info in model_list:
-        page_text = ""
-        layout_dets = ocr_page_info["layout_dets"]
-        for layout_det in layout_dets:
-            category_id = layout_det["category_id"]
-            allow_category_id_list = [15]
-            if category_id in allow_category_id_list:
-                page_text += layout_det["text"]
-        page_language = detect_lang(page_text)
-        language_lst.append(page_language)
-    # 统计text_language_list中每种语言的个数
-    count_dict = Counter(language_lst)
-    # 输出text_language_list中出现的次数最多的语言
-    language = max(count_dict, key=count_dict.get)
-    return language
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
+import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
+from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
-                                               ContentType)
 from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.model.magic_model import MagicModel

--- a/magic_pdf/libs/markdown_utils.py
+++ b/magic_pdf/libs/markdown_utils.py
-import re
-def escape_special_markdown_char(pymu_blocks):
-    """
-    转义正文里对markdown语法有特殊意义的字符
-    """
-    special_chars = ["*", "`", "~", "$"]
-    for blk in pymu_blocks:
-        for line in blk['lines']:
-            for span in line['spans']:
-                for char in special_chars:
-                    span_text = span['text']
-                    span_type = span.get("_type", None)
-                    if span_type in ['inline-equation', 'interline-equation']:
-                        continue
-                    elif span_text:
-                        span['text'] = span['text'].replace(char, "\\" + char)
-    return pymu_blocks
 def ocr_escape_special_markdown_char(content):
    """

--- a/magic_pdf/libs/nlp_utils.py
+++ b/magic_pdf/libs/nlp_utils.py
-import re
-from os import path
-from collections import Counter
-from loguru import logger
-# from langdetect import detect
-import spacy
-import en_core_web_sm
-import zh_core_web_sm
-from magic_pdf.libs.language import detect_lang
-class NLPModels:
-    """
-    How to upload local models to s3:
-        - config aws cli:
-            doc\SETUP-CLI.md
-            doc\setup_cli.sh
-            app\config\__init__.py
-        - $ cd {local_dir_storing_models}
-        - $ ls models
-            en_core_web_sm-3.7.1/
-            zh_core_web_sm-3.7.0/
-        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
-        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
-            PRE en_core_web_sm-3.7.1/
-            PRE zh_core_web_sm-3.7.0/
-    """
-    def __init__(self):
-        # if OS is windows, set "TMP_DIR" to "D:/tmp"
-        home_dir = path.expanduser("~")
-        self.default_local_path = path.join(home_dir, ".nlp_models")
-        self.default_shared_path = "/share/pdf_processor/nlp_models"
-        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
-        self.default_s3_path = "s3://llm-infra/models"
-        self.nlp_models = self.nlp_models = {
-            "en_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "zh_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-        }
-        self.en_core_web_sm_model = en_core_web_sm.load()
-        self.zh_core_web_sm_model = zh_core_web_sm.load()
-    def load_model(self, model_name, model_type, model_version):
-        if (
-            model_name in self.nlp_models
-            and self.nlp_models[model_name]["type"] == model_type
-            and self.nlp_models[model_name]["version"] == model_version
-        ):
-            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
-        else:
-            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
-            return None
-    def detect_language(self, text, use_langdetect=False):
-        if len(text) == 0:
-            return None
-        if use_langdetect:
-            # print("use_langdetect")
-            # print(detect_lang(text))
-            # return detect_lang(text)
-            if detect_lang(text) == "zh":
-                return "zh"
-            else:
-                return "en"
-        if not use_langdetect:
-            en_count = len(re.findall(r"[a-zA-Z]", text))
-            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
-            if en_count > cn_count:
-                return "en"
-            if cn_count > en_count:
-                return "zh"
-    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
-        """
-        Detect entity categories using NLP models and return the most frequent entity types.
-        Parameters
-        ----------
-        text : str
-            Text to be processed.
-        Returns
-        -------
-        str
-            The most frequent entity type.
-        """
-        lang = self.detect_language(text, use_langdetect=True)
-        if lang == "en":
-            nlp_model = self.en_core_web_sm_model
-        elif lang == "zh":
-            nlp_model = self.zh_core_web_sm_model
-        else:
-            # logger.error(f"Unsupported language: {lang}")
-            return {}
-        # Splitting text into smaller parts
-        text_parts = re.split(r"[,;，；、\s & |]+", text)
-        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
-        text_combined = " ".join(text_parts)
-        try:
-            doc = nlp_model(text_combined)
-            entity_counts = Counter([ent.label_ for ent in doc.ents])
-            word_counts_in_entities = Counter()
-            for ent in doc.ents:
-                word_counts_in_entities[ent.label_] += len(ent.text.split())
-            total_words_in_entities = sum(word_counts_in_entities.values())
-            total_words = len([token for token in doc if not token.is_punct])
-            if total_words_in_entities == 0 or total_words == 0:
-                return None
-            entity_percentage = total_words_in_entities / total_words
-            if entity_percentage < 0.5:
-                return None
-            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
-            entity_percentage = word_count / total_words_in_entities
-            if entity_percentage >= threshold:
-                return most_common_entity
-            else:
-                return None
-        except Exception as e:
-            logger.error(f"Error in entity detection: {e}")
-            return None
-def __main__():
-    nlpModel = NLPModels()
-    test_strings = [
-        "张三",
-        "张三, 李四，王五; 赵六",
-        "John Doe",
-        "Jane Smith",
-        "Lee, John",
-        "John Doe, Jane Smith; Alice Johnson，Bob Lee",
-        "孙七, Michael Jordan；赵八",
-        "David Smith  Michael O'Connor; Kevin ßáçøñ",
-        "李雷·韩梅梅, 张三·李四",
-        "Charles Robert Darwin, Isaac Newton",
-        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
-        "John Doe, Jane Smith; Alice Johnson",
-        "张三, 李四，王五; 赵六",
-        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
-        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
-        "Claire Chabut* and Jean-François Bussières",
-        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
-        "Changchun",
-        "china",
-        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
-        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
-        "Synergistic Effect of Supported Nickel Catalyst with",
-        "Intumescent Flame-Retardants on Flame Retardancy",
-        "and Thermal Stability of Polypropylene",
-    ]
-    for test in test_strings:
-        print()
-        print(f"Original String: {test}")
-        result = nlpModel.detect_entity_catgr_using_nlp(test)
-        print(f"Detected entities: {result}")
-if __name__ == "__main__":
-    __main__()
--- a/magic_pdf/libs/pdf_image_tools.py
+++ b/magic_pdf/libs/pdf_image_tools.py
 from io import BytesIO
 import cv2
+import fitz
 import numpy as np
 from PIL import Image
 from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import fitz, join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256

--- a/magic_pdf/libs/textbase.py
+++ b/magic_pdf/libs/textbase.py
-import math
-def __inc_dict_val(mp, key, val_inc:int):
-    if mp.get(key):
-        mp[key] = mp[key] + val_inc
-    else:
-        mp[key] = val_inc
-def get_text_block_base_info(block):
-    """
-    获取这个文本块里的字体的颜色、字号、字体
-    按照正文字数最多的返回
-    """
-    counter = {}
-    for line in block['lines']:
-        for span in line['spans']:
-            color = span['color']
-            size = round(span['size'], 2)
-            font = span['font']
-            txt_len = len(span['text'])
-            __inc_dict_val(counter, (color, size, font), txt_len)
-    c, s, ft = max(counter, key=counter.get)
-    return c, s, ft
\ No newline at end of file
--- a/magic_pdf/libs/vis_utils.py
+++ b/magic_pdf/libs/vis_utils.py
-from magic_pdf.libs.commons import fitz
-import os
-def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(save_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(save_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    color_map = {
-        'image': fitz.pdfcolor["yellow"],
-        'text': fitz.pdfcolor['blue'],
-        "table": fitz.pdfcolor['green']
-    }
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        width = raw_pdf_doc[page_idx].rect.width
-        height = raw_pdf_doc[page_idx].rect.height
-        new_page = doc.new_page(width=width, height=height)
-        shape = new_page.new_shape()
-        for order, block in enumerate(v['preproc_blocks']):
-            rect = fitz.Rect(block['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=color_map['text'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-        for img in v['images']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['yellow'])
-            shape.finish()
-            shape.commit()
-        for img in v['image_backup']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['yellow'],  fill=None)
-            shape.finish()
-            shape.commit()
-        for tb in v['droped_text_block']:
-            # 原始box画上去
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
-            shape.finish()
-            shape.commit()
-        # TODO table
-        for tb in v['tables']:
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    if is_new_pdf:
-        doc.save(save_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list,  expect_drop_bboxes:list, save_path: str, expected_page_id:int):
-    """
-    以覆盖的方式写个临时的pdf，用于debug
-    """
-    if page_idx!=expected_page_id:
-        return
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-    width = raw_pdf_doc[page_idx].rect.width
-    height = raw_pdf_doc[page_idx].rect.height
-    new_page = doc.new_page(width=width, height=height)
-    shape = new_page.new_shape()
-    for bbox in bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in droped_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in expect_drop_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    doc.save(save_path)
-    doc.close()
-def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
-    save_path = "./tmp/debug.pdf"
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-    width = page.rect.width
-    height = page.rect.height
-    new_page = doc.new_page(width=width, height=height)
-    shape = new_page.new_shape()
-    for bbox in bboxes1:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes2:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes3:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    doc.save(save_path)
-    doc.close() 
-def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        layouts = v['layout_bboxes']
-        page = doc[page_idx]
-        shape = page.new_shape()
-        for order, layout in enumerate(layouts):
-            border_offset = 1
-            rect_box = layout['layout_bbox']
-            layout_label = layout['layout_label']
-            fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
-            """
-            draw order text on layout box
-            """
-            font_size = 10
-            shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))
-        """画上footer header"""
-        if header:
-            shape.draw_rect(fitz.Rect(header))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        if footer:
-            shape.draw_rect(fitz.Rect(footer))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        shape.commit()
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-@DeprecationWarning
-def draw_layout_on_page(raw_pdf_doc: fitz.Document,  page_idx: int, page_layout: list, pdf_path: str):
-    """
-    把layout的box用红色边框花在pdf_path的page_idx上
-    """
-    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
-        border_offset = 1
-        rect_box = layout['layout_bbox']
-        layout_label = layout['layout_label']
-        sub_layout = layout['sub_layout']
-        if len(sub_layout)==0:
-            fill_color = fill_color if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)
-            # if layout_label=='U':
-            #     bad_boxes = layout.get("bad_boxes", [])
-            #     for bad_box in bad_boxes:
-            #         rect = fitz.Rect(*bad_box)
-            #         shape.draw_rect(rect)
-            #         shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2)
-        # else:
-        #     rect = fitz.Rect(*rect_box)
-        #     shape.draw_rect(rect)
-        #     shape.finish(color=fitz.pdfcolor['blue'])
-        for sub_layout in sub_layout:
-            draw(shape, sub_layout)
-        shape.commit()
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    page = doc[page_idx]
-    shape = page.new_shape()
-    for order, layout in enumerate(page_layout):
-        draw(shape, layout, fitz.pdfcolor['yellow'])
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-    parent_dir = os.path.dirname(pdf_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()
\ No newline at end of file
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pm = page.get_pixmap(matrix=mat, alpha=False)
-                # If the width or height exceeds 9000 after scaling, do not scale further.
+                # If the width or height exceeds 4500 after scaling, do not scale further.
-                if pm.width > 9000 or pm.height > 9000:
+                if pm.width > 4500 or pm.height > 4500:
                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
                img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)