Unverified Commit e6da37dd authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1099 from myhloli/dev

refactor(magic_pdf): remove unused functions and simplify code
parents 79b58a1e 6a22b5ab
import math
from loguru import logger
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
find_top_nearest_text_bbox)
from magic_pdf.libs.commons import join_path
# Shorthand aliases for the equation content types used throughout this module.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
# Content-list node types that carry plain text: paragraphs and heading levels 1-6.
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
@DeprecationWarning
def mk_nlp_markdown_1(para_dict: dict):
    """Join the sorted paragraph blocks into a text-only markdown string.

    Deprecated: superseded by mk_nlp_markdown(); the decorator above replaces
    this name with a DeprecationWarning instance, so it is no longer callable.
    """
    pieces = []
    for page_info in para_dict.values():
        para_blocks = page_info.get('para_blocks')
        if not para_blocks:
            continue
        for blk in para_blocks:
            for para in blk['paras'].values():
                text = para['para_text']
                if para['is_para_title']:
                    prefix = '#' * para['para_title_level']
                    pieces.append(f'{prefix} {text}')
                else:
                    pieces.append(text)
    return '\n\n'.join(pieces)
# Find the index of the target string inside the paragraph.
def __find_index(paragraph, target):
    """Return the index of *target* inside *paragraph*, or None when absent."""
    index = paragraph.find(target)
    # str.find() signals "not found" with -1; map that sentinel to None
    return index if index != -1 else None
def __insert_string(paragraph, target, position):
    """Return *paragraph* with *target* spliced in at character *position*."""
    head, tail = paragraph[:position], paragraph[position:]
    return head + target + tail
def __insert_after(content, image_content, target):
    """Insert *image_content*, wrapped in blank lines, right after the first
    occurrence of *target* in *content*; log an error when *target* is absent."""
    pos = content.find(target)
    if pos == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content
    cut = pos + len(target)
    return f'{content[:cut]}\n\n{image_content}\n\n{content[cut:]}'
def __insert_before(content, image_content, target):
    """Insert *image_content*, wrapped in blank lines, right before the first
    occurrence of *target* in *content*; log an error when *target* is absent."""
    pos = content.find(target)
    if pos == -1:
        logger.error(
            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
        )
        return content
    return f'{content[:pos]}\n\n{image_content}\n\n{content[pos:]}'
@DeprecationWarning
def mk_mm_markdown_1(para_dict: dict):
    """拼装多模态markdown."""
    # Deprecated builder: renders each page's paragraphs to markdown, then
    # splices image links in next to the geometrically nearest text line.
    # NOTE(review): the @DeprecationWarning decorator replaces this function
    # with a warning instance, so the name is no longer callable.
    content_lst = []
    for _, page_info in para_dict.items():
        page_lst = []  # paragraph strings collected for one page
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')
        # images, backed-up images, tables and backed-up tables are all
        # rendered the same way here (as markdown image links)
        all_page_images = []
        all_page_images.extend(page_info.get('images', []))
        all_page_images.extend(page_info.get('image_backup', []))
        all_page_images.extend(page_info.get('tables', []))
        all_page_images.extend(page_info.get('table_backup', []))
        if not para_blocks or not pymu_raw_blocks:  # page contains only images
            for img in all_page_images:
                page_lst.append(f"![]({img['image_path']})")  # TODO 图片顺序
            page_md = '\n\n'.join(page_lst)
        else:
            for block in para_blocks:
                item = block['paras']
                for _, p in item.items():
                    para_text = p['para_text']
                    is_title = p['is_para_title']
                    title_level = p['para_title_level']
                    md_title_prefix = '#' * title_level
                    if is_title:
                        page_lst.append(f'{md_title_prefix} {para_text}')
                    else:
                        page_lst.append(para_text)
            """拼装成一个页面的文本"""
            page_md = '\n\n'.join(page_lst)
            """插入图片"""
            for img in all_page_images:
                imgbox = img['bbox']
                img_content = f"![]({img['image_path']})"
                # first locate the text block whose bbox contains the image's
                # top-left corner
                for block in pymu_raw_blocks:
                    bbox = block['bbox']
                    if (
                        bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
                        and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
                    ):  # corner falls inside this block
                        for l in block['lines']:  # noqa: E741
                            line_box = l['bbox']
                            if (
                                line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
                                and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
                            ):  # inside this line: insert before the line's text
                                line_txt = ''.join([s['text'] for s in l['spans']])
                                page_md = __insert_before(
                                    page_md, img_content, line_txt
                                )
                                break
                            # NOTE(review): this unconditional break means only
                            # the first line of the block is ever examined, so
                            # the for-else below can only run when the block
                            # has no lines — looks unintentional; verify.
                            break
                        else:  # between lines: attach to the nearest line
                            # pick the line whose (x0, y0) is closest to the
                            # image's (x0, y0)
                            min_distance = 100000
                            min_line = None
                            for l in block['lines']:  # noqa: E741
                                line_box = l['bbox']
                                distance = math.sqrt(
                                    (line_box[0] - imgbox[0]) ** 2
                                    + (line_box[1] - imgbox[1]) ** 2
                                )
                                if distance < min_distance:
                                    min_distance = distance
                                    min_line = l
                            if min_line:
                                line_txt = ''.join(
                                    [s['text'] for s in min_line['spans']]
                                )
                                img_h = imgbox[3] - imgbox[1]
                                if min_distance < img_h:  # text sits above the image
                                    page_md = __insert_after(
                                        page_md, img_content, line_txt
                                    )
                                else:
                                    page_md = __insert_before(
                                        page_md, img_content, line_txt
                                    )
                            else:
                                logger.error(
                                    f"Can't find the location of image {img['image_path']} in the markdown file #1"
                                )
                else:  # no block contained it: it lies between two blocks
                    # prefer the nearest block above; fall back to the one below
                    top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
                    if top_txt_block:
                        line_txt = ''.join(
                            [s['text'] for s in top_txt_block['lines'][-1]['spans']]
                        )
                        page_md = __insert_after(page_md, img_content, line_txt)
                    else:
                        bottom_txt_block = find_bottom_nearest_text_bbox(
                            pymu_raw_blocks, imgbox
                        )
                        if bottom_txt_block:
                            line_txt = ''.join(
                                [
                                    s['text']
                                    for s in bottom_txt_block['lines'][0]['spans']
                                ]
                            )
                            page_md = __insert_before(page_md, img_content, line_txt)
                        else:
                            logger.error(
                                f"Can't find the location of image {img['image_path']} in the markdown file #2"
                            )
        content_lst.append(page_md)
    """拼装成全部页面的文本"""
    content_text = '\n\n'.join(content_lst)
    return content_text
def __insert_after_para(text, type, element, content_list):
    """Find the first text node containing *text* in *content_list* and insert
    a new image/table node directly after it; log an error when none matches."""
    for pos, node in enumerate(content_list):
        if node.get('type') not in UNI_FORMAT_TEXT_TYPE or text not in node.get('text', ''):
            continue
        if type == 'image':
            new_node = {
                'type': 'image',
                'img_path': element.get('image_path'),
                'img_alt': '',
                'img_title': '',
                'img_caption': '',
            }
        elif type == 'table':
            new_node = {
                'type': 'table',
                'img_path': element.get('image_path'),
                'table_latex': element.get('text'),
                'table_title': '',
                'table_caption': '',
                'table_quality': element.get('quality'),
            }
        content_list.insert(pos + 1, new_node)
        break
    else:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )
def __insert_before_para(text, type, element, content_list):
    """Find the first text node containing *text* in *content_list* and insert
    a new image/table node directly before it; log an error when none matches."""
    for pos, node in enumerate(content_list):
        if node.get('type') not in UNI_FORMAT_TEXT_TYPE or text not in node.get('text', ''):
            continue
        if type == 'image':
            new_node = {
                'type': 'image',
                'img_path': element.get('image_path'),
                'img_alt': '',
                'img_title': '',
                'img_caption': '',
            }
        elif type == 'table':
            new_node = {
                'type': 'table',
                'img_path': element.get('image_path'),
                'table_latex': element.get('text'),
                'table_title': '',
                'table_caption': '',
                'table_quality': element.get('quality'),
            }
        content_list.insert(pos, new_node)
        break
    else:
        logger.error(
            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
        )
def mk_universal_format(pdf_info_list: list, img_buket_path):
    """构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
    # Builds the unified content list: one flat list of typed nodes
    # (text / h1-h6 / equation / image / table) covering every page.
    content_lst = []
    for page_info in pdf_info_list:
        page_lst = []  # paragraph nodes collected for one page
        para_blocks = page_info.get('para_blocks')
        pymu_raw_blocks = page_info.get('preproc_blocks')
        all_page_images = []
        all_page_images.extend(page_info.get('images', []))
        all_page_images.extend(page_info.get('image_backup', []))
        # all_page_images.extend(page_info.get("tables",[]))
        # all_page_images.extend(page_info.get("table_backup",[]) )
        all_page_tables = []
        all_page_tables.extend(page_info.get('tables', []))
        if not para_blocks or not pymu_raw_blocks:  # page contains only images/tables
            for img in all_page_images:
                content_node = {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, img['image_path']),
                    'img_alt': '',
                    'img_title': '',
                    'img_caption': '',
                }
                page_lst.append(content_node)  # TODO 图片顺序
            for table in all_page_tables:
                content_node = {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, table['image_path']),
                    'table_latex': table.get('text'),
                    'table_title': '',
                    'table_caption': '',
                    'table_quality': table.get('quality'),
                }
                page_lst.append(content_node)  # TODO 图片顺序
        else:
            for block in para_blocks:
                item = block['paras']
                for _, p in item.items():
                    font_type = p[
                        'para_font_type'
                    ]  # a paragraph is either plain text or an interline equation
                    if font_type == TYPE_INTERLINE_EQUATION:
                        content_node = {'type': 'equation', 'latex': p['para_text']}
                        page_lst.append(content_node)
                    else:
                        para_text = p['para_text']
                        is_title = p['is_para_title']
                        title_level = p['para_title_level']
                        if is_title:
                            content_node = {
                                'type': f'h{title_level}',  # heading node, e.g. 'h2'
                                'text': para_text,
                            }
                            page_lst.append(content_node)
                        else:
                            content_node = {'type': 'text', 'text': para_text}
                            page_lst.append(content_node)
        content_lst.extend(page_lst)
        """插入图片"""
        for img in all_page_images:
            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
        """插入表格"""
        for table in all_page_tables:
            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
    # end for
    return content_lst
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
    # Insert an image/table node into content_lst next to the text that is
    # geometrically closest to the element's bbox on the original page.
    #   type: 'image' or 'table' — selects the node shape built downstream.
    #   element: dict with at least 'bbox' and 'image_path'.
    #   pymu_raw_blocks: raw pymupdf text blocks used for geometric lookup.
    #   content_lst: unified-format node list, modified in place.
    element_bbox = element['bbox']
    # first locate the text block whose bbox contains the element's top-left corner
    for block in pymu_raw_blocks:
        bbox = block['bbox']
        if (
            bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
            and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
        ):  # inside this big block: compare line by line
            for l in block['lines']:  # noqa: E741
                line_box = l['bbox']
                if (
                    line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
                    and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
                ):  # inside this line: insert before the line's text
                    line_txt = ''.join([s['text'] for s in l['spans']])
                    __insert_before_para(line_txt, type, element, content_lst)
                    break
                # NOTE(review): unconditional break — only the first line is
                # examined, so the for-else branch below can only trigger for
                # a block with no lines; looks unintentional, verify.
                break
            else:  # between the lines of this block
                # choose the line whose (x0, y0) is closest to the element's (x0, y0)
                min_distance = 100000
                min_line = None
                for l in block['lines']:  # noqa: E741
                    line_box = l['bbox']
                    distance = math.sqrt(
                        (line_box[0] - element_bbox[0]) ** 2
                        + (line_box[1] - element_bbox[1]) ** 2
                    )
                    if distance < min_distance:
                        min_distance = distance
                        min_line = l
                if min_line:
                    line_txt = ''.join([s['text'] for s in min_line['spans']])
                    img_h = element_bbox[3] - element_bbox[1]
                    if min_distance < img_h:  # the text sits above the element
                        __insert_after_para(line_txt, type, element, content_lst)
                    else:
                        __insert_before_para(line_txt, type, element, content_lst)
                    # break out of the enclosing block loop as well
                    break
                else:
                    logger.error(
                        f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
                    )
    else:  # no block contained it: it should lie between two blocks
        # prefer the nearest block above; fall back to the nearest one below
        top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
        if top_txt_block:
            line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
            __insert_after_para(line_txt, type, element, content_lst)
        else:
            bottom_txt_block = find_bottom_nearest_text_bbox(
                pymu_raw_blocks, element_bbox
            )
            if bottom_txt_block:
                line_txt = ''.join(
                    [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
                )
                __insert_before_para(line_txt, type, element, content_lst)
            else:  # TODO the element may occupy a whole column, with no text above or below it
                logger.error(
                    f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
                )
def mk_mm_markdown(content_list):
    """Render a unified-format content list to markdown, including images.

    content_list: nodes produced by mk_universal_format(); each node's 'type'
        is 'text', 'equation', an h1-h6 heading, or 'image'.
    Returns the markdown document as a single string.
    """
    content_md = []
    for c in content_list:
        content_type = c.get('type')
        if content_type == 'text':
            content_md.append(c.get('text'))
        elif content_type == 'equation':
            content = c.get('latex')
            if content is None:
                # a node without 'latex' used to crash on .startswith(); skip it
                continue
            if content.startswith('$$') and content.endswith('$$'):
                content_md.append(content)
            else:
                # wrap bare latex in a display-math block (reuse the fetched value)
                content_md.append(f'\n$$\n{content}\n$$\n')
        elif content_type in UNI_FORMAT_TEXT_TYPE:
            # 'h3' -> '### heading'
            content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
        elif content_type == 'image':
            content_md.append(f"![]({c.get('img_path')})")
    return '\n\n'.join(content_md)
def mk_nlp_markdown(content_list):
    """Render a unified-format content list to markdown, without any images."""
    rendered = []
    for node in content_list:
        node_type = node.get('type')
        # 'text' must be tested before the heading list, which also contains it
        if node_type == 'text':
            rendered.append(node.get('text'))
        elif node_type == 'equation':
            rendered.append(f"$$\n{node.get('latex')}\n$$")
        elif node_type == 'table':
            rendered.append(f"$$$\n{node.get('table_latex')}\n$$$")
        elif node_type in UNI_FORMAT_TEXT_TYPE:
            heading_level = int(node_type[1])
            rendered.append(f"{'#'*heading_level} {node.get('text')}")
    return '\n\n'.join(rendered)
# 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
# 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。
from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
from magic_pdf.libs.commons import mymax
# Positional indices into the extended bbox list:
# [x0, y0, x1, y1, content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1]
X0_IDX = 0
Y0_IDX = 1
X1_IDX = 2
Y1_IDX = 3
CONTENT_IDX = 4  # block content (image path / pymupdf block); may be None
IDX_X = 5  # horizontal occlusion depth; None until lazily computed
IDX_Y = 6  # vertical occlusion depth; None until lazily computed
CONTENT_TYPE_IDX = 7  # 'image' / 'table' / 'text' / 'spilter'
X0_EXT_IDX = 8  # extended coordinates, filled in by the layout pre-processing
Y0_EXT_IDX = 9
X1_EXT_IDX = 10
Y1_EXT_IDX = 11
def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: list, page_boundry, page):
    """Reassemble every element bbox on the page into one flat list for layout splitting.

    Each entry is [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type,
    ext_x0, ext_y0, ext_x1, ext_y1] with idx_x/idx_y initially None.  For
    images and equations block_content is the image path; for paragraphs it is
    the pymupdf block structure (see test/assets/papre/pymu_textblocks.json).
    Note: text_raw_blocks is iterated like a list of block dicts (the old
    dict annotation was wrong); inline_eq_info/interline_eq_info/page_boundry
    are currently unused here.
    """
    all_bboxes = []
    for image in image_info:
        box = image['bbox']
        # Horizontal column splitting is not implemented, so drop small images
        # first: they can distort layout detection when no horizontal split is
        # made (e.g. scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1).
        # Skip anything smaller than 50 x 50.
        if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
            continue
        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
    for table in table_info:
        box = table['bbox']
        all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
    """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes"""
    # add the text blocks
    text_block_temp = []
    for block in text_raw_blocks:
        bbox = block['bbox']
        text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
    text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)
    text_block_new = filter_lines_bbox(text_block_new)  # drop line-like bboxes, which can trap layout detection in an infinite loop
    """找出会影响layout的色块、横向分割线"""
    spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
    # also drop any text block that overlaps one of the spilter bboxes
    if len(spilter_bboxes) > 0:
        text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
    for bbox in text_block_new:
        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
    for bbox in spilter_bboxes:
        all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
    return all_bboxes
def resolve_bbox_overlap_for_layout_det(bboxes:list):
    """Resolve overlaps between candidate layout bboxes.

    1. Drop every bbox that is fully contained in another one.
    2. Repeatedly merge vertically-overlapping pairs into their common
       bounding box until the set of boxes stops changing.
    Note: merged entries become plain 4-tuples (x0, y0, x1, y1); callers only
    read the first four coordinates.
    """
    def _is_in_other_bbox(i:int):
        """Return True when bboxes[i] is contained inside some other bbox."""
        for j in range(0, len(bboxes)):
            if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
                return True
            # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
            #     return True
        return False
    # step 1: drop the contained bboxes
    new_bbox_1 = []
    for i in range(0, len(bboxes)):
        if not _is_in_other_bbox(i):
            new_bbox_1.append(bboxes[i])
    # step 2: iterate pairwise merges to a fixed point
    new_box = []
    new_bbox_2 = []
    len_1 = len(new_bbox_2)  # size after the previous round; 0 before the first
    while True:
        merged_idx = []  # indices already consumed by a merge this round
        for i in range(0, len(new_bbox_1)):
            if i in merged_idx:
                continue
            for j in range(i+1, len(new_bbox_1)):
                if j in merged_idx:
                    continue
                bx1 = new_bbox_1[i]
                bx2 = new_bbox_1[j]
                if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
                    merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
                    new_bbox_2.append(merged_box)
                    merged_idx.append(i)
                    merged_idx.append(j)
        for i in range(0, len(new_bbox_1)):  # carry over the boxes that were not merged
            if i not in merged_idx:
                new_bbox_2.append(new_bbox_1[i])
        # stop once nothing is left or the size no longer shrinks
        if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
            break
        else:
            len_1 = len(new_bbox_2)
            new_box = new_bbox_2
            new_bbox_1, new_bbox_2 = new_bbox_2, []
    return new_box
def filter_lines_bbox(bboxes: list):
    """Drop degenerate, line-like bboxes (width or height <= 1).

    The old docstring claimed this filters "empty lines"; what the code
    actually removes is any box thin enough to be a ruling line, which can
    trap the layout detection in an infinite loop.
    """
    return [
        box
        for box in bboxes
        if abs(box[0] - box[2]) > 1 and abs(box[1] - box[3]) > 1
    ]
################################################################################
# 第一种排序算法
# 以下是基于延长线遮挡做的一个算法
#
################################################################################
def find_all_left_bbox(this_bbox, all_bboxes) -> list:
    """Collect every bbox lying entirely to the left of *this_bbox*
    (extension-line rule: no vertical-overlap requirement)."""
    return [candidate for candidate in all_bboxes if candidate[X1_IDX] <= this_bbox[X0_IDX]]
def find_all_top_bbox(this_bbox, all_bboxes) -> list:
    """Collect every bbox lying entirely above *this_bbox*
    (extension-line rule: no horizontal-overlap requirement)."""
    return [candidate for candidate in all_bboxes if candidate[Y1_IDX] <= this_bbox[Y0_IDX]]
def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
    """Memoized horizontal occlusion depth of *this_bbox* (extension-line rule).

    Depth is 0 when nothing lies to its left, otherwise one more than the
    deepest left-hand bbox.  The result is cached in this_bbox[IDX_X].
    """
    if this_bbox[IDX_X] is None:
        left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
        if not left_bboxes:
            this_bbox[IDX_X] = 0
        else:
            depths = [get_and_set_idx_x(box, all_bboxes) for box in left_bboxes]
            this_bbox[IDX_X] = mymax(depths) + 1
    return this_bbox[IDX_X]
def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
    """Memoized vertical occlusion depth of *this_bbox* (extension-line rule).

    Depth is 0 when nothing lies above it, otherwise one more than the deepest
    bbox above.  The result is cached in this_bbox[IDX_Y].
    """
    if this_bbox[IDX_Y] is None:
        top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
        if not top_bboxes:
            this_bbox[IDX_Y] = 0
        else:
            depths = [get_and_set_idx_y(box, all_bboxes) for box in top_bboxes]
            this_bbox[IDX_Y] = mymax(depths) + 1
    return this_bbox[IDX_Y]
def bbox_sort(all_bboxes: list):
    """Sort bboxes into reading order via the extension-line occlusion depths."""
    keyed = []
    for bbox in all_bboxes:
        idx_x = get_and_set_idx_x(bbox, all_bboxes)
        idx_y = get_and_set_idx_y(bbox, all_bboxes)
        # collapse (idx_x, idx_y) into one scalar so X dominates and Y breaks ties
        keyed.append((idx_x * 100000 + idx_y, bbox))
    keyed.sort(key=lambda pair: pair[0])
    return [bbox for _, bbox in keyed]
################################################################################
# 第二种排序算法
# 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况
#
################################################################################
def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
    """Return, as a 0/1-element list, the closest bbox strictly to the left of
    *this_bbox* whose vertical span actually overlaps it (direct occlusion)."""
    def _y_overlaps(box):
        return any([
            box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
            this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
            box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX]])

    candidates = [box for box in all_bboxes
                  if box[X1_IDX] <= this_bbox[X0_IDX] and _y_overlaps(box)]
    if not candidates:
        return []
    # keep only the horizontally nearest one (largest x1)
    return [max(candidates, key=lambda box: box[X1_IDX])]
def get_and_set_idx_x_2(this_bbox, all_bboxes):
    """Memoized horizontal occlusion depth counting only DIRECT occlusion
    (no extension lines); cached in this_bbox[IDX_X]."""
    if this_bbox[IDX_X] is None:
        nearest = find_left_nearest_bbox(this_bbox, all_bboxes)
        if not nearest:
            this_bbox[IDX_X] = 0
        else:
            this_bbox[IDX_X] = get_and_set_idx_x_2(nearest[0], all_bboxes) + 1
    return this_bbox[IDX_X]
def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
    """Return, as a 0/1-element list, the closest bbox strictly above
    *this_bbox* whose horizontal span actually overlaps it (direct occlusion)."""
    def _x_overlaps(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX]])

    candidates = [box for box in all_bboxes
                  if box[Y1_IDX] <= this_bbox[Y0_IDX] and _x_overlaps(box)]
    if not candidates:
        return []
    # keep only the vertically nearest one (largest y1)
    return [max(candidates, key=lambda box: box[Y1_IDX])]
def get_and_set_idx_y_2(this_bbox, all_bboxes):
    """Memoized vertical occlusion depth counting only DIRECT occlusion
    (no extension lines); cached in this_bbox[IDX_Y]."""
    if this_bbox[IDX_Y] is None:
        nearest = find_top_nearest_bbox(this_bbox, all_bboxes)
        if not nearest:
            this_bbox[IDX_Y] = 0
        else:
            this_bbox[IDX_Y] = get_and_set_idx_y_2(nearest[0], all_bboxes) + 1
    return this_bbox[IDX_Y]
def paper_bbox_sort(all_bboxes: list, page_width=None, page_height=None):
    """Sort bboxes into reading order via the direct-occlusion depths.

    page_width/page_height are not used by the algorithm; they now default to
    None so the one-argument call in paper_bbox_sort_v2() no longer raises
    TypeError, while existing three-argument callers keep working.
    """
    for bbox in all_bboxes:
        # both setters cache their result in the bbox itself
        get_and_set_idx_x_2(bbox, all_bboxes)
        get_and_set_idx_y_2(bbox, all_bboxes)
    # collapse (idx_x, idx_y) into one scalar so X dominates and Y breaks ties
    keyed = [(bbox[IDX_X] * 100000 + bbox[IDX_Y], bbox) for bbox in all_bboxes]
    keyed.sort(key=lambda pair: pair[0])
    return [bbox for _, bbox in keyed]
################################################################################
"""
第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1
这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下:
1. 首先在水平方向上对bbox进行扩展。扩展方法是:
- 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。
- 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。
- 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
2. 合并所有的连续水平方向的bbox, 合并方法是:
- 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。
3. 然后在垂直方向上对bbox进行扩展。扩展方法是:
- 首先从page上切割掉合并后的水平bbox, 得到几个新的block
针对每个block
- x0: 找到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0
- x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1
随后在垂直方向上合并所有的连续的block,方法如下:
- 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。
如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT'
如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。
至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。
(完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去)
"""
################################################################################
def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
    """Return ALL bboxes left of *this_bbox* whose EXTENDED vertical span
    overlaps it, ordered nearest-first (descending extended x1).

    Unlike find_left_nearest_bbox() this keeps every neighbor, not just the
    closest one.  (The original's `left_boxes = left_boxes` reassignment and
    empty else branch were no-ops and have been removed.)
    """
    left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
        box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
        this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
        box[Y0_EXT_IDX] == this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX] == this_bbox[Y1_EXT_IDX]])]
    left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
    return left_boxes
def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
    """Return ALL bboxes above *this_bbox* whose EXTENDED horizontal span
    overlaps it, ordered nearest-first (descending extended y1).

    (The original's `top_boxes = top_boxes` reassignment and empty else branch
    were no-ops and have been removed.)
    """
    top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
        box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
        this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
        box[X0_EXT_IDX] == this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX] == this_bbox[X1_EXT_IDX]])]
    top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
    return top_boxes
def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
    """Memoized horizontal occlusion depth over the EXTENDED bboxes; cached in
    this_bbox[IDX_X].

    Fix: the recursion previously called get_and_set_idx_x_2() — the
    non-extended variant — while the symmetric get_and_set_idx_y_2_ext()
    recurses into itself.  Neighbors found via the extended coordinates are
    now also ranked via the extended variant, matching the y-axis code.
    """
    if this_bbox[IDX_X] is not None:
        return this_bbox[IDX_X]
    left_neighbors = find_left_neighbor_bboxes(this_bbox, all_bboxes)
    if len(left_neighbors) == 0:
        this_bbox[IDX_X] = 0
    else:
        left_idx_x = [get_and_set_idx_x_2_ext(b, all_bboxes) for b in left_neighbors]
        this_bbox[IDX_X] = mymax(left_idx_x) + 1
    return this_bbox[IDX_X]
def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
    """Memoized vertical occlusion depth over the EXTENDED bboxes; cached in
    this_bbox[IDX_Y]."""
    if this_bbox[IDX_Y] is None:
        top_neighbors = find_top_neighbor_bboxes(this_bbox, all_bboxes)
        if not top_neighbors:
            this_bbox[IDX_Y] = 0
        else:
            depths = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_neighbors]
            this_bbox[IDX_Y] = mymax(depths) + 1
    return this_bbox[IDX_Y]
def _paper_bbox_sort_ext(all_bboxes: list):
    """Sort bboxes into reading order using the extended-bbox occlusion depths."""
    keyed = []
    for bbox in all_bboxes:
        idx_x = get_and_set_idx_x_2_ext(bbox, all_bboxes)
        idx_y = get_and_set_idx_y_2_ext(bbox, all_bboxes)
        # X dominates; Y breaks ties
        keyed.append((idx_x * 100000 + idx_y, bbox))
    keyed.sort(key=lambda pair: pair[0])
    return [bbox for _, bbox in keyed]
# ===============================================================================================
def find_left_bbox_ext_line(this_bbox, all_bboxes):
    """Return the nearest bbox entirely left of *this_bbox* using the
    extension-line rule (no vertical-overlap requirement), or None.

    (The old `-> list` annotation was wrong: a single bbox or None is
    returned.  sort+[0] is replaced by max(..., default=None), which picks the
    same first-occurring maximum.)
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
    # nearest = the candidate with the largest x1
    return max(left_boxes, key=lambda box: box[X1_IDX], default=None)
def find_right_bbox_ext_line(this_bbox, all_bboxes):
    """Return the nearest bbox entirely right of *this_bbox* using the
    extension-line rule (no vertical-overlap requirement), or None.

    (The old `-> list` annotation was wrong: a single bbox or None is
    returned.  sort+[0] is replaced by min(..., default=None).)
    """
    right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
    # nearest = the candidate with the smallest x0
    return min(right_boxes, key=lambda box: box[X0_IDX], default=None)
# =============================================================================================
def find_left_nearest_bbox_direct(this_bbox, all_bboxes):
    """Return the nearest directly-occluding bbox to the left of *this_bbox*
    (vertical spans must really overlap — no extension lines), or None.

    Fix: the sort key used a truthiness test (`x[X1_EXT_IDX] if x[X1_EXT_IDX]
    else x[X1_IDX]`), so an extended x1 of exactly 0 was wrongly ignored and
    the raw x1 used instead; only None now falls back to the raw coordinate.
    (The old `-> list` annotation was also wrong: a bbox or None is returned.)
    """
    left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX]])]
    # pick the horizontally nearest candidate: largest (extended, when set) x1
    if len(left_boxes) > 0:
        left_boxes.sort(
            key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] is not None else x[X1_IDX],
            reverse=True,
        )
        return left_boxes[0]
    return None
def find_right_nearst_bbox_direct(this_bbox, all_bboxes):
    """Return the nearest directly-occluding bbox to the right of *this_bbox*
    (vertical spans must really overlap), or None.

    Fix: same truthiness bug as find_left_nearest_bbox_direct() — an extended
    x0 of exactly 0 was wrongly ignored; only None now falls back to the raw
    x0.  (The old `-> list` annotation was also wrong.)
    """
    right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
        this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
        box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
        box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX]])]
    # pick the horizontally nearest candidate: smallest (extended, when set) x0
    if len(right_bboxes) > 0:
        right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] is not None else x[X0_IDX])
        return right_bboxes[0]
    return None
def reset_idx_x_y(all_boxes: list) -> list:
    """Clear the cached occlusion depths so they can be recomputed; returns
    the same list for convenience."""
    for box in all_boxes:
        box[IDX_X] = box[IDX_Y] = None
    return all_boxes
# ===================================================================================================
def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
    """Return the nearest directly-occluding bbox above *this_bbox*
    (horizontal spans must really overlap), or None.
    NOTE: despite the annotation, a single bbox or None is returned."""
    def _x_overlaps(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX]])

    candidates = [box for box in bboxes_collection
                  if box[Y1_IDX] <= this_bbox[Y0_IDX] and _x_overlaps(box)]
    if not candidates:
        return None
    # nearest above = the candidate with the largest y1
    return max(candidates, key=lambda box: box[Y1_IDX])
def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
    """Return the nearest directly-occluding bbox below *this_bbox*
    (horizontal spans must really overlap), or None.
    NOTE: despite the annotation, a single bbox or None is returned."""
    def _x_overlaps(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX]])

    candidates = [box for box in bboxes_collection
                  if box[Y0_IDX] >= this_bbox[Y1_IDX] and _x_overlaps(box)]
    if not candidates:
        return None
    # nearest below = the candidate with the smallest y0
    return min(candidates, key=lambda box: box[Y0_IDX])
def find_boundry_bboxes(bboxes: list) -> tuple:
    """Return the bounding box (min x0, min y0, max x1, max y1) covering all
    of *bboxes*."""
    first = bboxes[0]
    x0, y0 = first[X0_IDX], first[Y0_IDX]
    x1, y1 = first[X1_IDX], first[Y1_IDX]
    for box in bboxes:
        x0 = min(x0, box[X0_IDX])
        y0 = min(y0, box[Y0_IDX])
        x1 = max(x1, box[X1_IDX])
        y1 = max(y1, box[Y1_IDX])
    return x0, y0, x1, y1
def extend_bbox_vertical(bboxes: list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
    """Vertically extend every bbox that has no other bbox directly above or
    below it (i.e. it owns its column), stretching its extended coordinates to
    the given vertical boundary.  Other boxes are left untouched; the list is
    modified in place and returned."""
    for box in bboxes:
        above = find_top_nearest_bbox_direct(box, bboxes)
        below = find_bottom_nearest_bbox_direct(box, bboxes)
        if above is None and below is None:  # the box owns its whole column
            box[X0_EXT_IDX] = box[X0_IDX]
            box[Y0_EXT_IDX] = boundry_y0
            box[X1_EXT_IDX] = box[X1_IDX]
            box[Y1_EXT_IDX] = boundry_y1
    return bboxes
# ===================================================================================================
def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
    """Sort bboxes with extra pre-processing (v2).

    Rough pipeline: pre-sort, horizontally extend boxes that own a full
    row, group consecutive full-width boxes, slice the remaining area into
    bands, vertically extend/group inside each band, then hand the result
    to ``_paper_bbox_sort_ext`` for the final ordering.

    return:
        [
            {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label": "GOOD_LAYOUT/BAD_LAYOUT",
                "content_bboxes": []  # each element is [x0, y0, x1, y1,
                    # block_content, idx_x, idx_y, content_type,
                    # ext_x0, ext_y0, ext_x1, ext_y1], in reading order
            }
        ]

    NOTE(review): the intermediate ``sorted_layouts`` structure is built
    but the final return value comes from ``_paper_bbox_sort_ext`` — this
    function looks half-finished; confirm against callers before reuse.
    """
    sorted_layouts = []  # final result
    page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
    all_bboxes = paper_bbox_sort(all_bboxes)  # rough initial ordering
    # First extend, horizontally, every bbox that owns a full row.
    for bbox in all_bboxes:
        left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes)  # real edges, no extension lines
        right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
        if left_nearest_bbox is None and right_nearest_bbox is None:  # owns the whole row
            bbox[X0_EXT_IDX] = page_x0
            bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
            bbox[X1_EXT_IDX] = page_x1
            bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
    # Row-owning boxes are now extended to the page edges; merge runs of
    # consecutive extended boxes into groups.
    if len(all_bboxes)==1:
        return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
    if len(all_bboxes)==0:
        return []
    """
    然后合并所有连续水平方向的bbox.
    """
    all_bboxes.sort(key=lambda x: x[Y0_IDX])
    h_bboxes = []
    h_bbox_group = []
    v_boxes = []
    for bbox in all_bboxes:
        # BUGFIX: the extension step above wrote page_x0/page_x1 into the
        # *_EXT_IDX slots, so membership must be tested against those slots
        # (the original compared X0_IDX/X1_IDX, which never match).
        if bbox[X0_EXT_IDX] == page_x0 and bbox[X1_EXT_IDX] == page_x1:
            h_bbox_group.append(bbox)
        else:
            if len(h_bbox_group)>0:
                h_bboxes.append(h_bbox_group)
                h_bbox_group = []
    # flush the last group
    if len(h_bbox_group)>0:
        h_bboxes.append(h_bbox_group)
    """
    现在h_bboxes里面是所有的group了,每个group都是一个list
    对h_bboxes里的每个group进行计算放回到sorted_layouts里
    """
    for gp in h_bboxes:
        gp.sort(key=lambda x: x[Y0_IDX])
        block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
        # The group's layout bbox: smallest x0/y0 and largest x1/y1.
        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
        block_info["layout_bbox"] = [x0, y0, x1, y1]
        sorted_layouts.append(block_info)
    # Use the y-extents of the full-width groups to slice the remaining
    # area into horizontal bands.
    h_split_lines = [page_y0]
    for gp in h_bboxes:
        # BUGFIX: gp is a list of bboxes (not a dict); take the group's
        # vertical extent directly from its first/last member. The original
        # indexed gp['layout_bbox'], which raises TypeError on a list.
        y0, y1 = gp[0][Y0_EXT_IDX], gp[-1][Y1_EXT_IDX]
        h_split_lines.append(y0)
        h_split_lines.append(y1)
    h_split_lines.append(page_y1)
    unsplited_bboxes = []
    for i in range(0, len(h_split_lines), 2):
        start_y0, start_y1 = h_split_lines[i:i+2]
        # Boxes fully inside [start_y0, start_y1] form one unsplit band.
        bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
        unsplited_bboxes.append(bboxes_in_block)
    # ================== horizontal slicing/sorting done ====================================
    """
    接下来针对每个非水平的部分切分垂直方向的
    此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分.
    分3步:
    1. 先把能完全垂直打通的隔离出去当做一个layout
    2. 其余的先垂直切分
    3. 垂直切分之后的部分再尝试水平切分
    4. 剩下的不能被切分的各个部分当成一个layout
    """
    # Vertically split each band.
    for bboxes_in_block in unsplited_bboxes:
        # First extend this band's boxes vertically within its boundary.
        boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block)
        extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
        # BUGFIX: sort the extended bbox list, not the extend_bbox_vertical
        # function object (the original raised AttributeError here).
        extended_vertical_bboxes.sort(key=lambda x: x[X0_IDX])  # small-to-large x == left-to-right reading order
        v_boxes_group = []
        for bbox in extended_vertical_bboxes:
            # BUGFIX: vertical extension writes into the *_EXT_IDX slots,
            # so column membership must be tested against them.
            if bbox[Y0_EXT_IDX]==boundry_y0 and bbox[Y1_EXT_IDX]==boundry_y1:
                v_boxes_group.append(bbox)
            else:
                if len(v_boxes_group)>0:
                    v_boxes.append(v_boxes_group)
                    v_boxes_group = []
        if len(v_boxes_group)>0:
            v_boxes.append(v_boxes_group)
    # Append the consecutive vertical groups to sorted_layouts.
    for gp in v_boxes:
        gp.sort(key=lambda x: x[X0_IDX])
        block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
        # The group's layout bbox: smallest x0/y0 and largest x1/y1.
        x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
        block_info["layout_bbox"] = [x0, y0, x1, y1]
        sorted_layouts.append(block_info)
    # Slice the remaining area using the x-extents of these column groups.
    v_split_lines = [boundry_x0]
    for gp in v_boxes:
        # BUGFIX: gp is a list of bboxes; take the group's horizontal
        # extent directly (the original indexed gp['layout_bbox']).
        x0, x1 = gp[0][X0_EXT_IDX], gp[-1][X1_EXT_IDX]
        v_split_lines.append(x0)
        v_split_lines.append(x1)
    v_split_lines.append(boundry_x1)
    reset_idx_x_y(all_bboxes)
    all_boxes = _paper_bbox_sort_ext(all_bboxes)
    return all_boxes
from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly to the left of ``this_bbox``.

    A candidate either (a) lies entirely to the left — its right edge at
    or before this_bbox's left edge — while overlapping vertically, or
    (b) intersects this_bbox from the left (``_left_intersect``), in which
    case the fully-left requirement is waived.
    Returns the candidate whose right edge (extended edge preferred when
    set) is largest, i.e. the horizontally closest, or None.
    """
    def _overlaps_vertically(box):
        return any([
            box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX],
            box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
            this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX],
            this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
            box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if (box[X1_IDX] <= this_bbox[X0_IDX] and _overlaps_vertically(box))
        or _left_intersect(box[:4], this_bbox[:4])
    ]
    if not candidates:
        return None
    # Nearest on the left == largest right edge; fall back to the real
    # edge when the extended slot is unset.
    return max(candidates, key=lambda b: b[X1_EXT_IDX] if b[X1_EXT_IDX] else b[X1_IDX])
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly to the right of ``this_bbox``.

    A candidate either starts at or beyond this_bbox's right edge while
    overlapping vertically, or intersects this_bbox from the right
    (``_right_intersect``). Returns the candidate with the smallest left
    edge (extended edge preferred when set), or None.
    """
    def _overlaps_vertically(box):
        return any([
            this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX],
            this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
            box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX],
            box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
            box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if (box[X0_IDX] >= this_bbox[X1_IDX] and _overlaps_vertically(box))
        or _right_intersect(this_bbox[:4], box[:4])
    ]
    if not candidates:
        return None
    # Nearest on the right == smallest left edge; prefer the extended edge.
    return min(candidates, key=lambda b: b[X0_EXT_IDX] if b[X0_EXT_IDX] else b[X0_IDX])
def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly above ``this_bbox``.

    A candidate ends at or above this_bbox's top edge and overlaps it
    horizontally. Returns the candidate with the largest bottom edge
    (extended edge preferred when set), or None.
    """
    def _overlaps_horizontally(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    # Nearest above == largest bottom edge; prefer the extended edge.
    return max(candidates, key=lambda b: b[Y1_EXT_IDX] if b[Y1_EXT_IDX] else b[Y1_IDX])
def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly below ``this_bbox``.

    A candidate starts at or below this_bbox's bottom edge and overlaps
    it horizontally. Returns the candidate with the smallest top edge,
    or None.
    """
    def _overlaps_horizontally(box):
        return any([
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    return min(candidates, key=lambda b: b[Y0_IDX])
# ===================================================================================================================
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly below ``this_bbox``, breaking ties
    towards the right.

    Among candidates below that overlap horizontally, pick the ones whose
    top edge is closest (smallest y0), and of those the one whose right
    edge is largest. Returns None when nothing qualifies.
    """
    def _overlaps_horizontally(box):
        return any([
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    # Closest top edge first, then the right-most among the equally close.
    min_y0 = min(b[Y0_IDX] for b in candidates)
    closest = [b for b in candidates if b[Y0_IDX] == min_y0]
    return max(closest, key=lambda b: b[X1_IDX])
def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly below ``this_bbox``, breaking ties
    towards the left.

    Among candidates below that overlap horizontally, pick the ones whose
    top edge is closest (smallest y0), and of those the one whose left
    edge is smallest. Returns None when nothing qualifies.
    """
    def _overlaps_horizontally(box):
        return any([
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    # Closest top edge first, then the left-most among the equally close.
    min_y0 = min(b[Y0_IDX] for b in candidates)
    closest = [b for b in candidates if b[Y0_IDX] == min_y0]
    return min(closest, key=lambda b: b[X0_IDX])
def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly above ``this_bbox``, breaking ties
    towards the left.

    Among candidates above that overlap horizontally, pick the ones whose
    bottom edge is closest (largest y1), and of those the one whose left
    edge is smallest. Returns None when nothing qualifies.
    """
    def _overlaps_horizontally(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    # Closest bottom edge first, then the left-most among the equally close.
    max_y1 = max(b[Y1_IDX] for b in candidates)
    closest = [b for b in candidates if b[Y1_IDX] == max_y1]
    return min(closest, key=lambda b: b[X0_IDX])
def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """Find the nearest bbox directly above ``this_bbox``, breaking ties
    towards the right.

    Among candidates above that overlap horizontally, pick the ones whose
    bottom edge is closest (largest y1), and of those the one whose right
    edge is largest. Returns None when nothing qualifies.
    """
    def _overlaps_horizontally(box):
        return any([
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX],
            box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX],
            this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
            box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX],
        ])

    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and _overlaps_horizontally(box)
    ]
    if not candidates:
        return None
    # Closest bottom edge first, then the right-most among the equally close.
    max_y1 = max(b[Y1_IDX] for b in candidates)
    closest = [b for b in candidates if b[Y1_IDX] == max_y1]
    return max(closest, key=lambda b: b[X1_IDX])
# ===================================================================================================================
def get_left_edge_bboxes(all_bboxes) -> list:
    """Return the bboxes that have nothing directly to their left."""
    edge_boxes = []
    for box in all_bboxes:
        if find_all_left_bbox_direct(box, all_bboxes) is None:
            edge_boxes.append(box)
    return edge_boxes
def get_right_edge_bboxes(all_bboxes) -> list:
    """Return the bboxes that have nothing directly to their right."""
    edge_boxes = []
    for box in all_bboxes:
        if find_all_right_bbox_direct(box, all_bboxes) is None:
            edge_boxes.append(box)
    return edge_boxes
def fix_vertical_bbox_pos(bboxes:list):
    """Resolve slight vertical overlaps between bboxes.

    The boxes are sorted top-to-bottom; whenever an upper box fully
    overlaps a lower one's top in the x direction
    (``_is_bottom_full_overlap``), the lower box is pushed just beneath
    the upper one. The list is mutated in place and returned.
    """
    bboxes.sort(key=lambda box: box[Y0_IDX])  # top-to-bottom
    for i, upper in enumerate(bboxes):
        for lower in bboxes[i + 1:]:
            if _is_bottom_full_overlap(upper[:4], lower[:4]):
                # Push the lower box just below the upper one; the 2px
                # margin is an empirical value.
                lower[Y0_IDX] = upper[Y1_IDX] + 2
                break
    return bboxes
"""对pdf上的box进行layout识别,并对内部组成的box进行排序."""
from loguru import logger
from magic_pdf.layout.bbox_sort import (CONTENT_IDX, CONTENT_TYPE_IDX,
X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX,
Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX,
paper_bbox_sort)
from magic_pdf.layout.layout_det_utils import (
find_all_bottom_bbox_direct, find_all_left_bbox_direct,
find_all_right_bbox_direct, find_all_top_bbox_direct,
find_bottom_bbox_direct_from_left_edge,
find_bottom_bbox_direct_from_right_edge,
find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge,
get_left_edge_bboxes, get_right_edge_bboxes)
from magic_pdf.libs.boxbase import get_bbox_in_boundary
# Layout labels used throughout the splitters below ("u|v|h|b").
LAYOUT_V = 'V'  # vertical layout (a column)
LAYOUT_H = 'H'  # horizontal layout (a full-width row)
LAYOUT_UNPROC = 'U'  # unprocessed: could not (yet) be split
LAYOUT_BAD = 'B'  # layout the splitter gave up on
def _is_single_line_text(bbox):
    """Check whether the text inside ``bbox`` is a single line.

    NOTE(review): currently short-circuited to always return True (TODO),
    so everything after the first ``return`` is dead code kept for the
    eventual real implementation.
    """
    return True # TODO
    box_type = bbox[CONTENT_TYPE_IDX]
    if box_type != 'text':
        return False
    paras = bbox[CONTENT_IDX]['paras']
    text_content = ''
    for para_id, para in paras.items(): # assemble the inner paragraph text
        is_title = para['is_title']
        if is_title != 0:
            text_content += f"## {para['text']}"
        else:
            text_content += para['text']
        text_content += '\n\n'
    return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split('\n\n')) <= 1
def _horizontal_split(bboxes: list, boundary: tuple, avg_font_size=20) -> list:
    """Split ``bboxes`` horizontally inside ``boundary``.

    Method: find boxes that are directly occluded on neither the left nor
    the right, extend them to the full boundary width, and cut the region
    along those full-width rows.

    Returns a list of large layout areas, each a dict
    ``{"layout_bbox": [x0, y0, x1, y1], "layout_label": "h|u|v",
    "sub_layout": []}`` where ``h`` = horizontal, ``u`` = undetected,
    ``v`` = vertical layout.

    ``avg_font_size`` calibrates the "crosses the mid line by at least two
    characters" test below.
    """
    sorted_layout_blocks = []  # the value that is ultimately returned
    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # rough pre-sort based on direct occlusion (disabled)
    """
    首先在水平方向上扩展独占一行的bbox
    """
    last_h_split_line_y1 = bound_y0  # remember the previous horizontal split line
    for i, bbox in enumerate(all_bboxes):
        left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes)  # real (non-extended) edges
        right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes)
        if left_nearest_bbox is None and right_nearest_bbox is None:  # owns the whole row
            """
            然而,如果只是孤立的一行文字,那么就还要满足以下几个条件才可以:
            1. bbox和中心线相交。或者
            2. 上方或者下方也存在同类水平的独占一行的bbox。 或者
            3. TODO 加强条件:这个bbox上方和下方是同一列column,那么就不能算作独占一行
            """
            # First check whether this bbox holds only one line of text
            # is_single_line = _is_single_line_text(bbox)
            """
            这里有个点需要注意,当页面内容不是居中的时候,第一次调用传递的是page的boundary,这个时候mid_x就不是中心线了.
            所以这里计算出最紧致的boundary,然后再计算mid_x
            """
            boundary_real_x0, boundary_real_x1 = min(
                [bbox[X0_IDX] for bbox in all_bboxes]
            ), max([bbox[X1_IDX] for bbox in all_bboxes])
            mid_x = (boundary_real_x0 + boundary_real_x1) / 2
            # Does the box's content cross the centre line?
            # It must cross by at least two characters' width on each side.
            is_cross_boundary_mid_line = (
                min(mid_x - bbox[X0_IDX], bbox[X1_IDX] - mid_x) > avg_font_size * 2
            )
            """
            检查条件2
            """
            is_belong_to_col = False
            """
            检查是否能被上方col吸收,方法是:
            1. 上方非空且不是独占一行的,并且
            2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1]
            """
            """
            以迭代的方式向上找,查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
            """
            # Pin down the vertical range above: [last split line, top of this bbox]
            b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX]
            # Walk upwards from this box, collecting all boxes that overlap it in x
            box_to_check = [bound_x0, b_y0, bound_x1, b_y1]
            bbox_in_bound_check = get_bbox_in_boundary(all_bboxes, box_to_check)
            bboxes_on_top = []
            virtual_box = bbox
            while True:
                b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check)
                if b_on_top is not None:
                    bboxes_on_top.append(b_on_top)
                    virtual_box = [
                        min([virtual_box[X0_IDX], b_on_top[X0_IDX]]),
                        min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]),
                        max([virtual_box[X1_IDX], b_on_top[X1_IDX]]),
                        b_y1,
                    ]
                else:
                    break
            # Next determine the min x0 / max x1 over those boxes
            if len(bboxes_on_top) > 0 and len(bboxes_on_top) != len(
                bbox_in_bound_check
            ):  # virtual_box may have inflated to fill the whole area, in which case it can no longer count as one column
                min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
                # Then, rather crudely, check whether min_x0/max_x1 intersect any box inside [bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]
                if not any(
                    [
                        b[X0_IDX] <= min_x0 - 1 <= b[X1_IDX]
                        or b[X0_IDX] <= max_x1 + 1 <= b[X1_IDX]
                        for b in bbox_in_bound_check
                    ]
                ):
                    # Neither above nor below may extend into a full row; for now only the neighbours are checked TODO
                    top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
                    bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
                    if not any(
                        [
                            top_nearest_bbox is not None
                            and (
                                find_all_left_bbox_direct(top_nearest_bbox, bboxes)
                                is None
                                and find_all_right_bbox_direct(top_nearest_bbox, bboxes)
                                is None
                            ),
                            bottom_nearest_bbox is not None
                            and (
                                find_all_left_bbox_direct(bottom_nearest_bbox, bboxes)
                                is None
                                and find_all_right_bbox_direct(
                                    bottom_nearest_bbox, bboxes
                                )
                                is None
                            ),
                            top_nearest_bbox is None or bottom_nearest_bbox is None,
                        ]
                    ):
                        is_belong_to_col = True
            # Check whether it can be absorbed by a column below TODO
            """
            这里为什么没有is_cross_boundary_mid_line的条件呢?
            确实有些杂志左右两栏宽度不是对称的。
            """
            if not is_belong_to_col or is_cross_boundary_mid_line:
                bbox[X0_EXT_IDX] = bound_x0
                bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
                bbox[X1_EXT_IDX] = bound_x1
                bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
                last_h_split_line_y1 = bbox[Y1_IDX]  # move the split line down
            else:
                continue
    """
    此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
    然后合并所有连续水平方向的bbox.
    """
    all_bboxes.sort(key=lambda x: x[Y0_IDX])
    h_bboxes = []
    h_bbox_group = []
    for bbox in all_bboxes:
        if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
            h_bbox_group.append(bbox)
        else:
            if len(h_bbox_group) > 0:
                h_bboxes.append(h_bbox_group)
                h_bbox_group = []
    # flush the final group
    if len(h_bbox_group) > 0:
        h_bboxes.append(h_bbox_group)
    """
    现在h_bboxes里面是所有的group了,每个group都是一个list
    对h_bboxes里的每个group进行计算放回到sorted_layouts里
    """
    h_layouts = []
    for gp in h_bboxes:
        gp.sort(key=lambda x: x[Y0_IDX])
        # Compute the group's layout_bbox: smallest x0/y0 and largest x1/y1
        x0, y0, x1, y1 = (
            gp[0][X0_EXT_IDX],
            gp[0][Y0_EXT_IDX],
            gp[-1][X1_EXT_IDX],
            gp[-1][Y1_EXT_IDX],
        )
        h_layouts.append([x0, y0, x1, y1, LAYOUT_H])  # horizontal layout
    """
    接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分
    """
    h_split_lines = [bound_y0]
    for gp in h_bboxes:  # gp is a list[bbox_list]
        y0, y1 = gp[0][1], gp[-1][3]
        h_split_lines.append(y0)
        h_split_lines.append(y1)
    h_split_lines.append(bound_y1)
    unsplited_bboxes = []
    for i in range(0, len(h_split_lines), 2):
        start_y0, start_y1 = h_split_lines[i : i + 2]
        # Boxes fully inside [start_y0, start_y1] form one unsplit band
        bboxes_in_block = [
            bbox
            for bbox in all_bboxes
            if bbox[Y0_IDX] >= start_y0 and bbox[Y1_IDX] <= start_y1
        ]
        unsplited_bboxes.append(bboxes_in_block)
    # Append the unprocessed bands to h_layouts
    for bboxes_in_block in unsplited_bboxes:
        if len(bboxes_in_block) == 0:
            continue
        x0, y0, x1, y1 = (
            bound_x0,
            min([bbox[Y0_IDX] for bbox in bboxes_in_block]),
            bound_x1,
            max([bbox[Y1_IDX] for bbox in bboxes_in_block]),
        )
        h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC])
    h_layouts.sort(key=lambda x: x[1])  # sort by y0, i.e. top-to-bottom
    """
    转换成如下格式返回
    """
    for layout in h_layouts:
        sorted_layout_blocks.append(
            {
                'layout_bbox': layout[:4],
                'layout_label': layout[4],
                'sub_layout': [],
            }
        )
    return sorted_layout_blocks
###############################################################################################
#
# 垂直方向的处理
#
#
###############################################################################################
def _vertical_align_split_v1(bboxes: list, boundary: tuple) -> list:
    """Split ``bboxes`` into column layouts by vertical alignment.

    Handles the column dimension for regions holding one column of many
    rows. Whatever cannot be fully split is returned as a layout with
    label ``u`` (LAYOUT_UNPROC).

    -----------------------
    |     |           |
    |     |           |
    |     |           |
    |     |           |
    -------------------------
    The layout above would be cut into two columns.
    """
    sorted_layout_blocks = []  # the value that is ultimately returned
    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
    v_blocks = []
    """
    先从左到右切分
    """
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
        if len(left_edge_bboxes) == 0:
            break
        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes]) + 1
        # Check that this cut line neither intersects nor touches any bbox
        if any(
            [bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]
        ):
            # The vertical cut line crosses some box: a clean vertical split is impossible.
            break
        else:  # a full column was split off successfully
            # Use the left-most box's left edge as the layout's x0
            layout_x0 = min(
                [bbox[X0_IDX] for bbox in left_edge_bboxes]
            )  # mainly so the rendering shows some spacing
            v_blocks.append(
                [
                    layout_x0,
                    new_boundary[1],
                    right_split_line_x1,
                    new_boundary[3],
                    LAYOUT_V,
                ]
            )
            new_boundary[0] = right_split_line_x1  # advance the left boundary
    """
    再从右到左切, 此时如果还是无法完全切分,那么剩余部分作为layout_lable为u的layout返回
    """
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
        if len(right_edge_bboxes) == 0:
            break
        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes]) - 1
        # Check that this cut line neither intersects nor touches any bbox
        if any(
            [bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]
        ):
            # The remainder stays unsplit
            unsplited_block.append(
                [
                    new_boundary[0],
                    new_boundary[1],
                    new_boundary[2],
                    new_boundary[3],
                    LAYOUT_UNPROC,
                ]
            )
            break
        else:
            # Use the right-most box's right edge as the layout's x1
            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
            v_blocks.append(
                [
                    left_split_line_x0,
                    new_boundary[1],
                    layout_x1,
                    new_boundary[3],
                    LAYOUT_V,
                ]
            )
            new_boundary[2] = left_split_line_x0  # shrink the right boundary
    """
    最后拼装成layout格式返回
    """
    for block in v_blocks:
        sorted_layout_blocks.append(
            {
                'layout_bbox': block[:4],
                'layout_label': block[4],
                'sub_layout': [],
            }
        )
    for block in unsplited_block:
        sorted_layout_blocks.append(
            {
                'layout_bbox': block[:4],
                'layout_label': block[4],
                'sub_layout': [],
            }
        )
    # sort by x0 (left-to-right)
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _vertical_align_split_v2(bboxes: list, boundary: tuple) -> list:
    """Improved version of ``_vertical_align_split``.

    The original algorithm could treat a second-column box as part of the
    first column when nothing occluded it on the left, collapsing a
    multi-column layout into a single column. This version starts from
    the top-left box and scans downwards, repeatedly widening
    ``(w_x0, w_x1)`` until the column cannot be extended further or the
    bottom boundary is reached. Boxes sliced by a failed cut line are
    recorded in ``bad_boxes`` on the returned unsplit layout.
    """
    sorted_layout_blocks = []  # the value that is ultimately returned
    new_boundary = [boundary[0], boundary[1], boundary[2], boundary[3]]
    bad_boxes = []  # boxes that a cut line sliced through
    v_blocks = []
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        if len(all_bboxes) == 0:
            break
        left_top_box = min(
            all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX])
        )  # should be hardened: verify it really sits in the first column TODO
        start_box = [
            left_top_box[X0_IDX],
            left_top_box[Y0_IDX],
            left_top_box[X1_IDX],
            left_top_box[Y1_IDX],
        ]
        w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
        """
        然后沿着这个box线向下找最近的那个box, 然后扩展w_x0, w_x1
        扩展之后,宽度会增加,随后用x=w_x1来检测在边界内是否有box与相交,如果相交,那么就说明不能再扩展了。
        当不能扩展的时候就要看是否到达下边界:
        1. 达到,那么更新左边界继续分下一个列
        2. 没有达到,那么此时开始从右侧切分进入下面的循环里
        """
        while left_top_box is not None:  # scan downwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_bottom_bbox_direct_from_left_edge(
                virtual_box, all_bboxes
            )
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
                )
        # In case the initial box sat mid-column, also scan upwards
        start_box = [
            w_x0,
            start_box[Y0_IDX],
            w_x1,
            start_box[Y1_IDX],
        ]  # widen it first for robustness
        left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
        while left_top_box is not None:  # scan upwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max(
                    [virtual_box[X1_IDX], left_top_box[X1_IDX]]
                )
        # Check whether the cut line at w_x1 crosses any box
        if any([bbox[X0_IDX] <= w_x1 + 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            for b in all_bboxes:
                if b[X0_IDX] <= w_x1 + 1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else:  # a full column was split off successfully
            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
            new_boundary[0] = w_x1  # advance the left boundary
    """
    接着开始从右上角的box扫描
    """
    w_x0, w_x1 = 0, 0
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        if len(all_bboxes) == 0:
            break
        # Find the box with the largest X1 first
        bbox_list_sorted = sorted(
            all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True
        )
        # Then, find the boxes with the smallest Y0 value
        bigest_x1 = bbox_list_sorted[0][X1_IDX]
        boxes_with_bigest_x1 = [
            bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1
        ]  # i.e. the right-most ones
        right_top_box = min(
            boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]
        )  # the one with the smallest y0
        start_box = [
            right_top_box[X0_IDX],
            right_top_box[Y0_IDX],
            right_top_box[X1_IDX],
            right_top_box[Y1_IDX],
        ]
        w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]
        while right_top_box is not None:
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_bottom_bbox_direct_from_right_edge(
                virtual_box, all_bboxes
            )
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
                    [w_x1, right_top_box[X1_IDX]]
                )
        # Scan upwards as well
        start_box = [
            w_x0,
            start_box[Y0_IDX],
            w_x1,
            start_box[Y1_IDX],
        ]  # widen it first for robustness
        right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
        while right_top_box is not None:
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_top_bbox_direct_from_right_edge(
                virtual_box, all_bboxes
            )
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max(
                    [w_x1, right_top_box[X1_IDX]]
                )
        # If the cut line crosses any box, a clean vertical split is impossible
        if any([bbox[X0_IDX] <= w_x0 - 1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            unsplited_block.append(
                [
                    new_boundary[0],
                    new_boundary[1],
                    new_boundary[2],
                    new_boundary[3],
                    LAYOUT_UNPROC,
                ]
            )
            for b in all_bboxes:
                if b[X0_IDX] <= w_x0 - 1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else:  # a full column was split off successfully
            v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
            new_boundary[2] = w_x0
    """转换数据结构"""
    for block in v_blocks:
        sorted_layout_blocks.append(
            {
                'layout_bbox': block[:4],
                'layout_label': block[4],
                'sub_layout': [],
            }
        )
    for block in unsplited_block:
        sorted_layout_blocks.append(
            {
                'layout_bbox': block[:4],
                'layout_label': block[4],
                'sub_layout': [],
                'bad_boxes': bad_boxes,  # record the boxes that were sliced by a cut line
            }
        )
    # sort by x0 (left-to-right)
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _try_horizontal_mult_column_split(bboxes: list, boundary: tuple) -> list:
    """Attempt a joint horizontal split across multiple columns; if the
    region cannot be split it should be treated as a BAD_LAYOUT.

    ------------------
    |        |       |
    ------------------
    |    |   |   |   |  <- the scenario this function is meant to split
    ------------------
    |        |       |
    |        |       |

    NOTE(review): not implemented yet — always returns None (see the TODO
    at the end of ``split_layout``).
    """
    pass
def _vertical_split(bboxes: list, boundary: tuple) -> list:
    """Cut the region vertically into blocks (columns).

    In this version, whatever cannot be split vertically is returned as a
    BAD_LAYOUT/unprocessed layout for the caller to deal with.

    --------------------------
    |       |                |
    |       |                |
    |       |
    |       |  <- the kind of column this function splits off
    |       |
    |       |                |
    |       |                |
    -------------------------
    """
    sorted_layout_blocks = []  # the value that is ultimately returned
    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
    all_bboxes = get_bbox_in_boundary(bboxes, boundary)
    """
    all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖
    all_bboxes = fix_hor_bbox_pos(all_bboxes) # 水平解覆盖
    这两行代码目前先不执行,因为公式检测,表格检测还不是很成熟,导致非常多的textblock参与了运算,时间消耗太大。
    这两行代码的作用是:
    如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩,从而避免重叠。对布局切分来说带来正反馈。
    """
    # all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # rough pre-sort based on direct occlusion (disabled)
    """
    首先在垂直方向上扩展独占一行的bbox
    """
    for bbox in all_bboxes:
        top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes)  # real (non-extended) edges
        bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
        if (
            top_nearest_bbox is None
            and bottom_nearest_bbox is None
            and not any(
                [
                    b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX]
                    or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX]
                    for b in all_bboxes
                ]
            )
        ):  # owns a whole column and overlaps nothing else
            bbox[X0_EXT_IDX] = bbox[X0_IDX]
            bbox[Y0_EXT_IDX] = bound_y0
            bbox[X1_EXT_IDX] = bbox[X1_IDX]
            bbox[Y1_EXT_IDX] = bound_y1
    """
    此时独占一列的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
    然后合并所有连续垂直方向的bbox.
    """
    all_bboxes.sort(key=lambda x: x[X0_IDX])
    # fix: do NOT merge horizontally adjacent columns into one row here — the
    # minimal blocks handed downstream must always read top-to-bottom safely.
    v_bboxes = []
    for box in all_bboxes:
        if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1:
            v_bboxes.append(box)
    """
    现在v_bboxes里面是所有的group了,每个group都是一个list
    对v_bboxes里的每个group进行计算放回到sorted_layouts里
    """
    v_layouts = []
    for vbox in v_bboxes:
        # gp.sort(key=lambda x: x[X0_IDX])
        # This box's layout_bbox comes straight from its extended coords
        x0, y0, x1, y1 = (
            vbox[X0_EXT_IDX],
            vbox[Y0_EXT_IDX],
            vbox[X1_EXT_IDX],
            vbox[Y1_EXT_IDX],
        )
        v_layouts.append([x0, y0, x1, y1, LAYOUT_V])  # vertical layout
    """
    接下来利用这些连续的垂直bbox的layout_bbox的x0, x1,从垂直上切分开其余的为几个部分
    """
    v_split_lines = [bound_x0]
    for gp in v_bboxes:
        x0, x1 = gp[X0_IDX], gp[X1_IDX]
        v_split_lines.append(x0)
        v_split_lines.append(x1)
    v_split_lines.append(bound_x1)
    unsplited_bboxes = []
    for i in range(0, len(v_split_lines), 2):
        start_x0, start_x1 = v_split_lines[i : i + 2]
        # Boxes fully inside [start_x0, start_x1] form one unsplit block
        bboxes_in_block = [
            bbox
            for bbox in all_bboxes
            if bbox[X0_IDX] >= start_x0 and bbox[X1_IDX] <= start_x1
        ]
        unsplited_bboxes.append(bboxes_in_block)
    # Append the unprocessed regions to v_layouts
    for bboxes_in_block in unsplited_bboxes:
        if len(bboxes_in_block) == 0:
            continue
        x0, y0, x1, y1 = (
            min([bbox[X0_IDX] for bbox in bboxes_in_block]),
            bound_y0,
            max([bbox[X1_IDX] for bbox in bboxes_in_block]),
            bound_y1,
        )
        v_layouts.append(
            [x0, y0, x1, y1, LAYOUT_UNPROC]
        )  # this region could not be analysed into a reliable layout
    v_layouts.sort(key=lambda x: x[0])  # sort by x0, i.e. left-to-right
    for layout in v_layouts:
        sorted_layout_blocks.append(
            {
                'layout_bbox': layout[:4],
                'layout_label': layout[4],
                'sub_layout': [],
            }
        )
    """
    至此,垂直方向切成了2种类型,其一是独占一列的,其二是未处理的。
    下面对这些未处理的进行垂直方向切分,这个切分要切出来类似“吕”这种类型的垂直方向的布局
    """
    for i, layout in enumerate(sorted_layout_blocks):
        if layout['layout_label'] == LAYOUT_UNPROC:
            x0, y0, x1, y1 = layout['layout_bbox']
            v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1])
            sorted_layout_blocks[i] = {
                'layout_bbox': [x0, y0, x1, y1],
                'layout_label': LAYOUT_H,
                'sub_layout': v_split_layouts,
            }
            layout['layout_label'] = LAYOUT_H  # cut by vertical lines into a horizontal layout
    return sorted_layout_blocks
def split_layout(bboxes: list, boundary: tuple, page_num: int) -> list:
    """Cut ``bboxes`` into a tree of layouts, horizontal splits first and
    then vertical splits.

    return:
        [
            {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label": "u|v|h|b",  # unprocessed|vertical|horizontal|BAD_LAYOUT
                "sub_layout": []  # nested layouts of the same shape; each
                    # content element is [x0, y0, x1, y1, block_content,
                    # idx_x, idx_y, content_type, ext_x0, ext_y0,
                    # ext_x1, ext_y1], in reading order
            },
            ...
        ]

    ``page_num`` is currently only used for (disabled) diagnostics.
    """
    sorted_layouts = []  # the final result
    boundary_x0, boundary_y0, boundary_x1, boundary_y1 = boundary
    if len(bboxes) <= 1:
        # 0 or 1 boxes: the whole boundary is trivially one vertical layout
        return [
            {
                'layout_bbox': [boundary_x0, boundary_y0, boundary_x1, boundary_y1],
                'layout_label': LAYOUT_V,
                'sub_layout': [],
            }
        ]
    """
    接下来按照先水平后垂直的顺序进行切分
    """
    bboxes = paper_bbox_sort(
        bboxes, boundary_x1 - boundary_x0, boundary_y1 - boundary_y0
    )
    sorted_layouts = _horizontal_split(bboxes, boundary)  # layouts produced by the horizontal split
    for i, layout in enumerate(sorted_layouts):
        x0, y0, x1, y1 = layout['layout_bbox']
        layout_type = layout['layout_label']
        if layout_type == LAYOUT_UNPROC:  # not a full-width row: needs a vertical split
            v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])
            """
            最后这里有个逻辑问题:如果这个函数只分离出来了一个column layout,那么这个layout分割肯定超出了算法能力范围。因为我们假定的是传进来的
            box已经把行全部剥离了,所以这里必须十多个列才可以。如果只剥离出来一个layout,并且是多个box,那么就说明这个layout是无法分割的,标记为LAYOUT_UNPROC
            """
            layout_label = LAYOUT_V
            if len(v_split_layouts) == 1:
                if len(v_split_layouts[0]['sub_layout']) == 0:
                    layout_label = LAYOUT_UNPROC
                    # logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)
            """
            组合起来最终的layout
            """
            sorted_layouts[i] = {
                'layout_bbox': [x0, y0, x1, y1],
                'layout_label': layout_label,
                'sub_layout': v_split_layouts,
            }
            layout['layout_label'] = LAYOUT_H
    """
    水平和垂直方向都切分完毕了。此时还有一些未处理的,这些未处理的可能是因为水平和垂直方向都无法切分。
    这些最后调用_try_horizontal_mult_block_split做一次水平多个block的联合切分,如果也不能切分最终就当做BAD_LAYOUT返回
    """
    # TODO
    return sorted_layouts
def get_bboxes_layout(all_boxes: list, boundary: tuple, page_id: int):
    """Split ``all_boxes`` into a layout tree and flatten its leaves.

    Returns a tuple ``(leaf_layouts, layout_tree)``: ``leaf_layouts`` is
    the list of layouts with an empty ``sub_layout``, collected in
    pre-order (i.e. top-to-bottom, left-to-right reading order), and
    ``layout_tree`` is the full nested structure from ``split_layout``.
    Each layout is a dict like
    ``{"layout_bbox": [x0, y0, x1, y1], "layout_label": "u|v|h|b", ...}``.
    """
    def _collect_leaves(nodes):
        # Pre-order walk: emit a node with no children directly, otherwise
        # recurse into its sub-layouts.
        leaves = []
        for node in nodes:
            children = node['sub_layout']
            if children:
                leaves.extend(_collect_leaves(children))
            else:
                leaves.append(node)
        return leaves

    # -------------------------------------------------------------------------------------------------------------------------
    layout_tree = split_layout(
        all_boxes, boundary, page_id
    )  # first split into layouts, yielding a tree
    return _collect_leaves(layout_tree), layout_tree
def get_columns_cnt_of_layout(layout_tree):
    """Return the maximum number of side-by-side columns in ``layout_tree``.

    Each top-level entry is measured independently and the widest wins:
    a leaf (empty sub_layout) or a horizontal band counts as one column;
    a vertical split contributes one column per leaf child and recurses
    into nested grandchildren.
    """
    widths = [0]  # seed so max() never sees an empty sequence
    for node in layout_tree:
        label = node['layout_label']
        children = node['sub_layout']
        if not children or label == LAYOUT_H:
            # A leaf or a horizontal band is a single column
            widths.append(1)
            continue
        col_cnt = 0
        for child in children:
            grandchildren = child['sub_layout']
            if not grandchildren:
                col_cnt += 1
            else:
                col_cnt += sum(
                    get_columns_cnt_of_layout([grandchild])
                    for grandchild in grandchildren
                )
        widths.append(col_cnt)
    return max(widths)
def sort_with_layout(bboxes: list, page_width, page_height) -> (list, list):
    """Sort a list of bboxes.

    The boxes are first split into layouts, then ordered within each layout.
    Returns ``(sorted_bboxes, layout_bboxes)``, or ``(None, None)`` when the
    page layout is too complex to split.
    """
    # Pad each raw 5-element box into the 13-element record expected by the
    # layout machinery (slot 7 is the content type, slot 12 carries box[4]).
    padded = [
        [b[0], b[1], b[2], b[3], None, None, None, 'text', None, None, None, None, b[4]]
        for b in bboxes
    ]

    layout_bboxes, _ = get_bboxes_layout(
        padded, tuple([0, 0, page_width, page_height]), 0
    )
    if any(lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes):
        logger.warning('drop this pdf, reason: 复杂版面')
        return None, None

    ordered = []
    # Use each layout bbox to frame a subset of boxes, then sort that subset.
    for layout in layout_bboxes:
        lbox = layout['layout_bbox']
        inside = get_bbox_in_boundary(padded, lbox)
        ordered.extend(
            paper_bbox_sort(inside, lbox[2] - lbox[0], lbox[3] - lbox[1])
        )
    return ordered, layout_bboxes
def sort_text_block(text_block, layout_bboxes):
    """Sort one page's text blocks according to layout order."""
    ordered_blocks = []
    bbox_list = []
    # Map bbox tuple -> block so sorted bboxes can be mapped back to blocks.
    lookup = {}
    for blk in text_block:
        box = blk['bbox']
        lookup[(box[0], box[1], box[2], box[3])] = blk
        bbox_list.append(box)

    # Walk the layouts in order; inside each layout sort boxes top-to-bottom.
    for layout in layout_bboxes:
        lb = layout['layout_bbox']
        # Grow the boundary by 1px on each side to tolerate rounding.
        inside = get_bbox_in_boundary(
            bbox_list,
            [lb[0] - 1, lb[1] - 1, lb[2] + 1, lb[3] + 1],
        )
        inside.sort(key=lambda b: b[1])  # sort by y0, top to bottom
        ordered_blocks.extend(lookup[(b[0], b[1], b[2], b[3])] for b in inside)
    return ordered_blocks
"""
找到能分割布局的水平的横线、色块
"""
import os
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def __rect_filter_by_width(rect, page_w, page_h):
    """Keep only rects that straddle the horizontal centre of the page.

    ``page_h`` is unused but kept for signature compatibility with callers.
    """
    centre_x = page_w / 2
    return rect[0] < centre_x < rect[2]
def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
    """A separator rect must not overlap any image or table bbox."""
    blockers = list(image_bboxes) + list(table_bboxes)
    return not any(_is_in_or_part_overlap(rect, box) for box in blockers)
def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """Debug helper: render three groups of bboxes onto a blank copy of *page*
    and dump the result to ./tmp/debug.pdf.

    bboxes1 -> red border + translucent blue fill;
    bboxes2 -> translucent yellow fill only;
    bboxes3 -> red border only.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # Remove any existing file so the output is always a fresh snapshot.
        os.remove(save_path)
    # Create a new blank PDF of the same size as the source page.
    doc = fitz.open('')
    width = page.rect.width
    height = page.rect.height
    new_page = doc.new_page(width=width, height=height)

    shape = new_page.new_shape()
    for bbox in bboxes1:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        # NOTE(review): the second, argument-less finish() appears redundant —
        # no new path was drawn after the first finish(); confirm before removing.
        shape.finish()
        shape.commit()
    for bbox in bboxes2:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        shape.finish()
        shape.commit()
    for bbox in bboxes3:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=None)
        shape.finish()
        shape.commit()

    # Make sure the output directory exists before saving.
    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    doc.save(save_path)
    doc.close()
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """Collect colored blocks and horizontal rules that can split the layout.

    A vector drawing qualifies when it has a non-white fill colour, straddles
    the horizontal centre of the page, and does not overlap any image or
    table bbox.

    Returns a list of [x0, y0, x1, y1] boxes.
    """
    cdrawings = page.get_cdrawings()

    spilter_bbox = []
    for block in cdrawings:
        # Fix: the original read `block['fill']` into an unused variable and
        # then re-tested `'fill' in block` twice; a single .get() suffices.
        fill = block.get('fill')
        if fill and fill != (1.0, 1.0, 1.0):
            rect = block['rect']
            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
                spilter_bbox.append(list(rect))

    # Repair degenerate boxes: a zero or negative height would make the layout
    # computation loop forever, so force a minimum height of 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    # __debug_show_page(page, spilter_bbox, [], [])

    return spilter_bbox
"""
This is an advanced PyMuPDF utility for detecting multi-column pages.
It can be used in a shell script, or its main function can be imported and
invoked as described below.
Features
---------
- Identify text belonging to (a variable number of) columns on the page.
- Text with different background color is handled separately, allowing for
easier treatment of side remarks, comment boxes, etc.
- Uses text block detection capability to identify text blocks and
uses the block bboxes as primary structuring principle.
- Supports ignoring footers via a footer margin parameter.
- Returns re-created text boundary boxes (integer coordinates), sorted ascending
by the top, then by the left coordinates.
Restrictions
-------------
- Only supporting horizontal, left-to-right text
- Returns a list of text boundary boxes - not the text itself. The caller is
expected to extract text from within the returned boxes.
- Text written above images is ignored altogether (option).
- This utility works as expected in most cases. The following situation cannot
be handled correctly:
* overlapping (non-disjoint) text blocks
* image captions are not recognized and are handled like normal text
Usage
------
- As a CLI shell command use
python multi_column.py input.pdf footer_margin
Where footer margin is the height of the bottom stripe to ignore on each page.
This code is intended to be modified according to your need.
- Use in a Python script as follows:
----------------------------------------------------------------------------------
from multi_column import column_boxes
# for each page execute
bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
# bboxes is a list of fitz.IRect objects, that are sort ascending by their y0,
# then x0 coordinates. Their text content can be extracted by all PyMuPDF
# get_text() variants, like for instance the following:
for rect in bboxes:
print(page.get_text(clip=rect, sort=True))
----------------------------------------------------------------------------------
"""
import sys
from magic_pdf.libs.commons import fitz
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Args:
        page: a fitz (PyMuPDF) page.
        footer_margin: height of the bottom stripe to ignore.
        header_margin: height of the top stripe to ignore.
        no_image_text: if True, text written on top of images is ignored.

    Returns:
        A list of fitz.IRect text boxes, sorted by background, then y0, then x0.
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            # an item that is None, or is 'bb' itself, or is disjoint from
            # temp does not block the extension
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b != None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # re-create the block bbox from lines with more than one character,
        # so stray single characters do not inflate the box
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return if no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
if __name__ == "__main__":
    # Only for debugging purposes, currently.
    # Draw red borders around the returned text bboxes and insert the bbox
    # number, then save the file under the name "input-blocks.pdf".
    #
    # CLI: multi_column.py input.pdf [footer_margin] [header_margin]
    filename = sys.argv[1]

    # optional margins, defaulting to 50 when not supplied
    footer_margin = int(sys.argv[2]) if len(sys.argv) > 2 else 50
    header_margin = int(sys.argv[3]) if len(sys.argv) > 3 else 50

    doc = fitz.open(filename)

    for page in doc:
        page.wrap_contents()  # remove any geometry issues

        # compute the text bboxes for this page
        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)

        # draw the rectangles and their sequence numbers in red
        shape = page.new_shape()
        for i, rect in enumerate(bboxes):
            shape.draw_rect(rect)  # draw a border
            shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
        shape.finish(color=fitz.pdfcolor["red"])
        shape.commit()  # store to the page

    # save document with text bboxes
    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))
\ No newline at end of file
import os
import csv
import json
import pandas as pd
from pandas import DataFrame as df
from matplotlib import pyplot as plt
from termcolor import cprint
"""
Execute this script in the following way:
1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
2. Under the directory code-clean, execute the following command:
$ python -m libs.calc_span_stats
"""
def print_green_on_red(text):
    """Print *text* in bold green on a red background, then a blank line."""
    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
def print_green(text):
    """Print a leading newline, then *text* in bold green, then a blank line."""
    print()
    cprint(text, "green", attrs=["bold"], end="\n\n")
def print_red(text):
    """Print a leading newline, then *text* in bold red, then a blank line."""
    print()
    cprint(text, "red", attrs=["bold"], end="\n\n")
def safe_get(dict_obj, key, default):
    """Like dict.get, but also falls back to *default* when the stored
    value is None (a plain .get(key, default) would return the None)."""
    val = dict_obj.get(key)
    return default if val is None else val
class SpanStatsCalc:
    """Calculate statistics of spans (font name/size/color/flags) in a pdf_dict."""

    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
        """Draw multiple figures in one figure.

        Placeholder: only allocates the canvas, nothing is drawn yet.
        """
        # make a canvas
        fig = plt.figure(fig_num, figsize=(20, 20))
        pass

    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
        """Calculate statistics per pdf_dict.

        Walks every span of every paragraph block on every page and returns
        a DataFrame with one row per span.
        """
        # Collect plain dicts first and build the DataFrame once at the end.
        # (fix: the original created an empty pd.DataFrame() that was
        # immediately overwritten by this list.)
        span_stats = []

        span_id = 0
        for page_id, blocks in pdf_dict.items():
            # only top-level keys like "page_0" carry page content
            if page_id.startswith("page_"):
                if "para_blocks" in blocks.keys():
                    for para_block in blocks["para_blocks"]:
                        for line in para_block["lines"]:
                            for span in line["spans"]:
                                span_text = safe_get(span, "text", "")
                                span_font_name = safe_get(span, "font", "")
                                span_font_size = safe_get(span, "size", 0)
                                span_font_color = safe_get(span, "color", "")
                                span_font_flags = safe_get(span, "flags", 0)
                                span_font_flags_decoded = safe_get(span, "decomposed_flags", {})
                                span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False)
                                span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False)
                                span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False)
                                span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False)
                                span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False)
                                span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False)
                                span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False)

                                span_stats.append(
                                    {
                                        "span_id": span_id,  # id of span
                                        "page_id": page_id,  # page number of pdf
                                        "span_text": span_text,  # text of span
                                        "span_font_name": span_font_name,  # font name of span
                                        "span_font_size": span_font_size,  # font size of span
                                        "span_font_color": span_font_color,  # font color of span
                                        "span_font_flags": span_font_flags,  # font flags of span
                                        "span_is_superscript": int(
                                            span_is_super_script
                                        ),  # indicate whether the span is super script or not
                                        "span_is_italic": int(span_is_italic),  # indicate whether the span is italic or not
                                        "span_is_serifed": int(span_is_serifed),  # indicate whether the span is serifed or not
                                        "span_is_sans_serifed": int(
                                            span_is_sans_serifed
                                        ),  # indicate whether the span is sans serifed or not
                                        "span_is_monospaced": int(
                                            span_is_monospaced
                                        ),  # indicate whether the span is monospaced or not
                                        "span_is_proportional": int(
                                            span_is_proportional
                                        ),  # indicate whether the span is proportional or not
                                        "span_is_bold": int(span_is_bold),  # indicate whether the span is bold or not
                                    }
                                )

                                span_id += 1

        return pd.DataFrame(span_stats)
def __find_pdf_dic_files(
    jf_name="pdf_dic.json",
    base_code_name="code-clean",
    tgt_base_dir_name="tmp",
    unittest_dir_name="unittest",
    md_dir_name="md",
    book_names=None,  # other possible values: "zlib", "arxiv" and so on
):
    """Find all *jf_name* files under <base_code_name>/tmp/unittest/md/<book>/.

    Locates the ``base_code_name`` segment inside this module's own path,
    then walks each requested book directory, collecting matching files.

    Returns a list of absolute file paths (empty when nothing is found).
    """
    # fix: the default was a mutable list literal, which is shared between
    # calls; use the None-sentinel idiom instead.
    if book_names is None:
        book_names = ["scihub"]

    pdf_dict_files = []
    curr_dir = os.path.dirname(__file__)

    # Scan the current path for the code-base directory name and anchor the
    # search there; stop after the first match.
    for i in range(len(curr_dir)):
        if curr_dir[i : i + len(base_code_name)] == base_code_name:
            base_code_dir_name = curr_dir[: i + len(base_code_name)]
            for book_name in book_names:
                search_dir_relative_name = os.path.join(tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name)
                if os.path.exists(base_code_dir_name):
                    search_dir_name = os.path.join(base_code_dir_name, search_dir_relative_name)
                    for root, dirs, files in os.walk(search_dir_name):
                        for file in files:
                            if file == jf_name:
                                pdf_dict_files.append(os.path.join(root, file))
            break

    return pdf_dict_files
def combine_span_texts(group_df, span_stats):
    """For every span in *group_df*, bundle its text together with the text of
    the previous and next span (by positional index in *span_stats*), each
    line prefixed with an arrow marker. Groups are joined by blank lines."""
    pointer_sign = "→ → → "
    snippets = []
    for _, row in group_df.iterrows():
        idx = row.name
        lines = []
        # previous span, the span itself, then the next span
        for nb_idx in (idx - 1, idx, idx + 1):
            if nb_idx == idx:
                text = row["span_text"]
            elif nb_idx in span_stats.index:
                text = span_stats.at[nb_idx, "span_text"]
            else:
                text = ""  # neighbour falls outside the table
            lines.append(pointer_sign + text)
        snippets.append("\n".join(lines))
    return "\n\n".join(snippets)
# pd.set_option("display.max_colwidth", None)  # set to None to display the full text
pd.set_option("display.max_rows", None)  # set to None to display more rows
def main():
    """Scan all pdf_dic.json files, compute per-span statistics, and export
    raw/final CSVs plus summary charts for superscript spans."""
    pdf_dict_files = __find_pdf_dic_files()
    # print(pdf_dict_files)

    span_stats_calc = SpanStatsCalc()

    for pdf_dict_file in pdf_dict_files:
        print("-" * 100)
        print_green_on_red(f"Processing {pdf_dict_file}")
        with open(pdf_dict_file, "r", encoding="utf-8") as f:
            pdf_dict = json.load(f)

        # one row per span, saved next to the source json
        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
        raw_df.to_csv(save_path, index=False)

        # keep only superscript spans for the summary
        filtered_df = raw_df[raw_df["span_is_superscript"] == 1]
        if filtered_df.empty:
            print("No superscript span found!")
            continue

        filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"])

        # per group: the span text with its neighbours for manual inspection
        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore

        final_df = filtered_grouped_df.size().reset_index(name="count")
        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)

        print(final_df)

        # normalize newlines for CSV consumers (e.g. Excel)
        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))

        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
        # Use UTF-8 with BOM and quote every field so spreadsheet apps open it cleanly.
        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)

        # Create a 2x2 chart layout.
        fig, axs = plt.subplots(2, 2, figsize=(15, 10))

        # Plot grouped by span_font_name.
        final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")

        # Plot grouped by span_font_size.
        final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")

        # Plot grouped by span_font_color.
        final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")

        # Plot grouped by the (name, size, color) combination.
        grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"])
        grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")

        # Tighten the layout.
        plt.tight_layout()

        # Show the charts.
        # plt.show()

        # Save the charts to a PNG file.
        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
        plt.savefig(save_path)

        # Clear the canvas for the next file.
        plt.clf()
# Script entry point: run the span-stats pipeline when executed directly.
if __name__ == "__main__":
    main()
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
    """Detect the dominant language of a document from per-page model output.

    Concatenates the text of every OCR text line (category_id 15) on each
    page, detects the page language, then returns the language that occurs
    on the most pages.

    Raises ValueError (from max) when model_list is empty.
    """
    # fix: this constant was rebuilt inside the inner loop on every
    # iteration; hoist it out once.
    allow_category_id_list = [15]

    language_lst = []
    for ocr_page_info in model_list:
        page_text = ""
        layout_dets = ocr_page_info["layout_dets"]
        for layout_det in layout_dets:
            if layout_det["category_id"] in allow_category_id_list:
                page_text += layout_det["text"]
        language_lst.append(detect_lang(page_text))

    # Count how often each language appears across pages ...
    count_dict = Counter(language_lst)
    # ... and return the most frequent one.
    return max(count_dict, key=count_dict.get)
import re
from os import path
from collections import Counter
from loguru import logger
# from langdetect import detect
import spacy
import en_core_web_sm
import zh_core_web_sm
from magic_pdf.libs.language import detect_lang
class NLPModels:
    r"""
    spaCy-based NER helpers for English and Chinese text.

    How to upload local models to s3:
    - config aws cli:
        doc\SETUP-CLI.md
        doc\setup_cli.sh
        app\config\__init__.py
    - $ cd {local_dir_storing_models}
    - $ ls models
        en_core_web_sm-3.7.1/
        zh_core_web_sm-3.7.0/
    - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
    - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
        PRE en_core_web_sm-3.7.1/
        PRE zh_core_web_sm-3.7.0/
    """

    def __init__(self):
        # if OS is windows, set "TMP_DIR" to "D:/tmp"
        home_dir = path.expanduser("~")
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"
        # Registry of supported models and their pinned versions.
        # (fix: was assigned twice in a row: `self.nlp_models = self.nlp_models = {...}`)
        self.nlp_models = {
            "en_core_web_sm": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_md": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_lg": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "zh_core_web_sm": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_md": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_lg": {
                "type": "spacy",
                "version": "3.7.0",
            },
        }
        # Eagerly load the two small models used by detect_entity_catgr_using_nlp.
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """Return the requested spaCy model, or None when it is not
        registered with a matching type/version, or not installed."""
        if (
            model_name in self.nlp_models
            and self.nlp_models[model_name]["type"] == model_type
            and self.nlp_models[model_name]["version"] == model_version
        ):
            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
        else:
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None

    def detect_language(self, text, use_langdetect=False):
        """Classify *text* as "en" or "zh".

        With use_langdetect=True the project-level detect_lang() decides
        (anything that is not "zh" maps to "en"); otherwise a simple
        character count is used. Returns None for empty text or when the
        Latin/CJK character counts tie.
        """
        if len(text) == 0:
            return None
        if use_langdetect:
            if detect_lang(text) == "zh":
                return "zh"
            else:
                return "en"

        if not use_langdetect:
            en_count = len(re.findall(r"[a-zA-Z]", text))
            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))

            if en_count > cn_count:
                return "en"

            if cn_count > en_count:
                return "zh"

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float
            Minimum share of entity words the winning entity type must cover.

        Returns
        -------
        str
            The most frequent entity type; {} for unsupported languages,
            None when no confident decision can be made.
        """
        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            # logger.error(f"Unsupported language: {lang}")
            return {}

        # Splitting text into smaller parts
        text_parts = re.split(r"[,;,;、\s & |]+", text)

        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)
            entity_counts = Counter([ent.label_ for ent in doc.ents])
            word_counts_in_entities = Counter()

            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = len([token for token in doc if not token.is_punct])

            if total_words_in_entities == 0 or total_words == 0:
                return None

            # NOTE: the 0.5 coverage floor below is intentionally independent
            # of *threshold* — it gates overall entity coverage of the text.
            entity_percentage = total_words_in_entities / total_words
            if entity_percentage < 0.5:
                return None

            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            entity_percentage = word_count / total_words_in_entities

            if entity_percentage >= threshold:
                return most_common_entity
            else:
                return None
        except Exception as e:
            logger.error(f"Error in entity detection: {e}")
            return None
def __main__():
    """Smoke test: run entity-category detection on a batch of sample strings
    and print each result."""
    model = NLPModels()

    test_strings = [
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills & William Barry & Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    ]

    for sample in test_strings:
        print()
        print(f"Original String: {sample}")
        detected = model.detect_entity_catgr_using_nlp(sample)
        print(f"Detected entities: {detected}")
# Script entry point: run the NER smoke test when executed directly.
if __name__ == "__main__":
    __main__()
import math
def __inc_dict_val(mp, key, val_inc: int):
    """Add *val_inc* to mp[key]; a missing (or falsy) entry is treated as
    absent and simply set to *val_inc*."""
    current = mp.get(key)
    mp[key] = current + val_inc if current else val_inc
def get_text_block_base_info(block):
    """Return the (color, size, font) of a text block.

    Every span's (color, rounded size, font) combination is weighted by the
    number of characters it renders; the combination covering the most body
    text wins (ties resolved by first occurrence).
    """
    from collections import Counter  # stdlib; local import keeps module deps unchanged

    # fix: replaces the hand-rolled __inc_dict_val counting helper with
    # collections.Counter — identical tallies, standard-library idiom.
    counter = Counter()
    for line in block['lines']:
        for span in line['spans']:
            key = (span['color'], round(span['size'], 2), span['font'])
            counter[key] += len(span['text'])

    c, s, ft = max(counter, key=counter.get)
    return c, s, ft
\ No newline at end of file
from magic_pdf.libs.commons import fitz
import os
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
    """
    Draw the bboxes of each page onto a PDF at save_path.

    For every entry in paras_dict a fresh page of the same size as the
    source page is created and the text / image / table boxes are painted
    on it. Appends to an existing file, otherwise creates a new one.
    """
    # Check whether the target file already exists.
    is_new_pdf = False
    if os.path.exists(save_path):
        # Open the existing PDF file.
        doc = fitz.open(save_path)
    else:
        # Create a new blank PDF file.
        is_new_pdf = True
        doc = fitz.open('')

    # Fill colours per content category.
    color_map = {
        'image': fitz.pdfcolor["yellow"],
        'text': fitz.pdfcolor['blue'],
        "table": fitz.pdfcolor['green']
    }

    for k, v in paras_dict.items():
        page_idx = v['page_idx']
        width = raw_pdf_doc[page_idx].rect.width
        height = raw_pdf_doc[page_idx].rect.height
        new_page = doc.new_page(width=width, height=height)

        shape = new_page.new_shape()
        for order, block in enumerate(v['preproc_blocks']):
            # Text blocks: translucent blue fill.
            rect = fitz.Rect(block['bbox'])
            shape = new_page.new_shape()
            shape.draw_rect(rect)
            shape.finish(color=None, fill=color_map['text'], fill_opacity=0.2)
            # NOTE(review): the second, argument-less finish() appears
            # redundant (no path drawn in between) — confirm before removing.
            shape.finish()
            shape.commit()

        for img in v['images']:
            # Draw the original image box: solid yellow fill.
            rect = fitz.Rect(img['bbox'])
            shape = new_page.new_shape()
            shape.draw_rect(rect)
            shape.finish(color=None, fill=fitz.pdfcolor['yellow'])
            shape.finish()
            shape.commit()

        for img in v['image_backup']:
            # Draw the original (backed-up) image box: yellow outline only.
            rect = fitz.Rect(img['bbox'])
            shape = new_page.new_shape()
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor['yellow'], fill=None)
            shape.finish()
            shape.commit()

        for tb in v['droped_text_block']:
            # Dropped text blocks: dark translucent overlay.
            rect = fitz.Rect(tb['bbox'])
            shape = new_page.new_shape()
            shape.draw_rect(rect)
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
            shape.finish()
            shape.commit()

        # TODO table
        for tb in v['tables']:
            # Tables: translucent green fill.
            rect = fitz.Rect(tb['bbox'])
            shape = new_page.new_shape()
            shape.draw_rect(rect)
            shape.finish(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)
            shape.finish()
            shape.commit()

    # Make sure the output directory exists before saving.
    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if is_new_pdf:
        doc.save(save_path)
    else:
        # incremental save keeps previously appended pages
        doc.saveIncr()
    doc.close()
def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list, expect_drop_bboxes:list, save_path: str, expected_page_id:int):
    """
    Overwrite a temporary single-page debug PDF.

    Only renders when page_idx equals expected_page_id. The three bbox
    groups are painted as: bboxes -> red border + blue fill;
    droped_bboxes -> yellow fill; expect_drop_bboxes -> red border only.
    """
    if page_idx!=expected_page_id:
        return

    if os.path.exists(save_path):
        # Remove the existing file so the output is always overwritten.
        os.remove(save_path)
    # Create a new blank PDF matching the source page size.
    doc = fitz.open('')
    width = raw_pdf_doc[page_idx].rect.width
    height = raw_pdf_doc[page_idx].rect.height
    new_page = doc.new_page(width=width, height=height)

    shape = new_page.new_shape()
    for bbox in bboxes:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        shape.finish()
        shape.commit()

    for bbox in droped_bboxes:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        shape.finish()
        shape.commit()

    for bbox in expect_drop_bboxes:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=None)
        shape.finish()
        shape.commit()

    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12,
    #                      color=(0, 0, 0))
    # shape.finish(color=fitz.pdfcolor['black'])
    # shape.commit()

    # Make sure the output directory exists before saving.
    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    doc.save(save_path)
    doc.close()
def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """Debug helper: render three groups of bboxes onto a blank copy of *page*
    and dump the result to ./tmp/debug.pdf.

    bboxes1 -> red border + blue fill; bboxes2 -> yellow fill;
    bboxes3 -> red border only.

    NOTE(review): duplicates the private __debug_show_page helper defined
    earlier in this repository — candidates for consolidation.
    """
    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # Remove the existing file so the output is always overwritten.
        os.remove(save_path)
    # Create a new blank PDF of the same size as the source page.
    doc = fitz.open('')
    width = page.rect.width
    height = page.rect.height
    new_page = doc.new_page(width=width, height=height)

    shape = new_page.new_shape()
    for bbox in bboxes1:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        shape.finish()
        shape.commit()

    for bbox in bboxes2:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        shape.finish()
        shape.commit()

    for bbox in bboxes3:
        # Draw the original box.
        rect = fitz.Rect(*bbox[0:4])
        shape = new_page.new_shape()
        shape.draw_rect(rect)
        shape.finish(color=fitz.pdfcolor['red'], fill=None)
        shape.finish()
        shape.commit()

    # Make sure the output directory exists before saving.
    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    doc.save(save_path)
    doc.close()
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
    """
    Draw each page's layout bboxes (with their reading order) plus the header/footer
    bands onto the PDF at pdf_path, creating the file when it does not yet exist.

    Parameters
    ----------
    raw_pdf_doc : fitz.Document
        Source document; not referenced in this body (pages come from pdf_path).
    paras_dict : dict
        Per-page entries carrying 'page_idx' and 'layout_bboxes'
        (each layout has 'layout_bbox' and 'layout_label').
    header, footer : sequence or None
        Optional rectangles shaded as grey bands when truthy.
    pdf_path : str
        Output PDF; opened in place if present, otherwise created blank.
    """
    # Check whether the output file already exists
    is_new_pdf = False
    if os.path.exists(pdf_path):
        # Open the existing PDF file
        doc = fitz.open(pdf_path)
    else:
        # Create a brand-new empty PDF file
        is_new_pdf = True
        doc = fitz.open('')
    for k, v in paras_dict.items():
        page_idx = v['page_idx']
        layouts = v['layout_bboxes']
        page = doc[page_idx]
        shape = page.new_shape()
        for order, layout in enumerate(layouts):
            border_offset = 1
            rect_box = layout['layout_bbox']
            layout_label = layout['layout_label']
            # 'U' (unknown/unsplittable) layouts are highlighted with a pink fill
            fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
            # Inset horizontally / outset vertically by 1pt so adjacent boxes stay visually distinct
            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
            rect = fitz.Rect(*rect_box)
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
            """
            draw order text on layout box
            """
            font_size = 10
            shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))
        # Shade the header and footer bands (translucent black) when provided
        """画上footer header"""
        if header:
            shape.draw_rect(fitz.Rect(header))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
        if footer:
            shape.draw_rect(fitz.Rect(footer))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
        # A single commit per page flushes all drawings onto it
        shape.commit()
    if is_new_pdf:
        doc.save(pdf_path)
    else:
        # Incremental save appends to the existing file instead of rewriting it
        doc.saveIncr()
    doc.close()
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Draw the nested layout boxes of `page_layout` with red borders on page
    `page_idx` of `pdf_path`, creating the file when it does not exist.

    Deprecated: kept for debugging only; emits a DeprecationWarning when called.

    Parameters
    ----------
    raw_pdf_doc : fitz.Document
        Unused in this body; kept for signature compatibility with callers.
    page_idx : int
        Index of the page to annotate.
    page_layout : list
        Nested layout dicts carrying 'layout_bbox', 'layout_label' and 'sub_layout'.
    pdf_path : str
        Output PDF path; opened if present, created otherwise.
    """
    # BUGFIX: this function used to be decorated with `@DeprecationWarning`, which
    # rebinds the name to an exception *instance* and makes every call raise
    # TypeError. Emit a proper runtime warning instead so the function stays callable.
    import warnings
    warnings.warn(
        'draw_layout_on_page is deprecated and kept for debugging only.',
        DeprecationWarning,
        stacklevel=2,
    )
    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        # Recursively draw a layout tree; only leaf layouts (no sub_layout) get a box.
        border_offset = 1
        rect_box = layout['layout_bbox']
        layout_label = layout['layout_label']
        sub_layout = layout['sub_layout']
        if len(sub_layout)==0:
            # Only 'U' (unsplittable) leaves keep the fill colour
            fill_color = fill_color if layout_label=='U' else None
            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
            rect = fitz.Rect(*rect_box)
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)
        for child in sub_layout:
            draw(shape, child)
        shape.commit()
    # Open the target file if it exists, otherwise start a fresh blank document
    is_new_pdf = False
    if os.path.exists(pdf_path):
        doc = fitz.open(pdf_path)
    else:
        is_new_pdf = True
        doc = fitz.open('')
    page = doc[page_idx]
    shape = page.new_shape()
    for order, layout in enumerate(page_layout):
        draw(shape, layout, fitz.pdfcolor['yellow'])
    parent_dir = os.path.dirname(pdf_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    if is_new_pdf:
        doc.save(pdf_path)
    else:
        # Incremental save appends to the existing file instead of rewriting it
        doc.saveIncr()
    doc.close()
\ No newline at end of file
import os
import unicodedata
from magic_pdf.para.commons import *
# Force UTF-8 stdout so CJK log text does not raise UnicodeEncodeError on narrow locales
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class BlockContinuationProcessor:
    """
    This class is used to process the blocks to detect block continuations.
    """
    def __init__(self) -> None:
        pass
    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
        """
        This function checks if the two font types are similar.
        Definition of similar font types: the two font types have a common prefix,
        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.
        Parameters
        ----------
        font_type1 : str
            font type 1
        font_type2 : str
            font type 2
        prefix_length_ratio : float
            minimum ratio of the common prefix length to the length of the shorter font type
        Returns
        -------
        bool
            True if the two font types are similar, False otherwise.
        """
        # Font types may arrive as lists; use the first entry as representative
        if isinstance(font_type1, list):
            font_type1 = font_type1[0] if font_type1 else ""
        if isinstance(font_type2, list):
            font_type2 = font_type2[0] if font_type2 else ""
        if font_type1 == font_type2:
            return True
        # Find the length of the common prefix
        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))
        # Calculate the minimum prefix length based on the ratio
        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)
        return common_prefix_length >= min_prefix_length
    def __is_same_block_font(self, block1, block2):
        """
        This function compares the font of block1 and block2
        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2
        Returns
        -------
        is_same : bool
            True if block1 and block2 have the same font, else False
        """
        block_1_font_type = safe_get(block1, "block_font_type", "")
        block_1_font_size = safe_get(block1, "block_font_size", 0)
        block_1_avg_char_width = safe_get(block1, "avg_char_width", 0)
        block_2_font_type = safe_get(block2, "block_font_type", "")
        block_2_font_size = safe_get(block2, "block_font_size", 0)
        block_2_avg_char_width = safe_get(block2, "avg_char_width", 0)
        # Font sizes may arrive as lists; use the first entry as representative
        if isinstance(block_1_font_size, list):
            block_1_font_size = block_1_font_size[0] if block_1_font_size else 0
        if isinstance(block_2_font_size, list):
            block_2_font_size = block_2_font_size[0] if block_2_font_size else 0
        block_1_text = safe_get(block1, "text", "")
        block_2_text = safe_get(block2, "text", "")
        # A zero average char width means the metric is unavailable — cannot compare
        if block_1_avg_char_width == 0 or block_2_avg_char_width == 0:
            return False
        if not block_1_text or not block_2_text:
            return False
        else:
            # Short second blocks give noisy width averages, so loosen the
            # relative-width tolerance from 20% to 50% when block2 is much shorter
            text_len_ratio = len(block_2_text) / len(block_1_text)
            if text_len_ratio < 0.2:
                avg_char_width_condition = (
                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
                    < 0.5
                )
            else:
                avg_char_width_condition = (
                    abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width)
                    < 0.2
                )
        block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1
        return (
            self.__is_similar_font_type(block_1_font_type, block_2_font_type)
            and avg_char_width_condition
            and block_font_size_condtion
        )
    def _is_alphabet_char(self, char):
        # ASCII letters A-Z / a-z only
        if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"):
            return True
        else:
            return False
    def _is_chinese_char(self, char):
        # CJK Unified Ideographs basic block (U+4E00..U+9FA5)
        if char >= "\u4e00" and char <= "\u9fa5":
            return True
        else:
            return False
    def _is_other_letter_char(self, char):
        # True for Unicode upper/lowercase letters that are neither ASCII nor basic CJK
        try:
            cat = unicodedata.category(char)
            if cat == "Lu" or cat == "Ll":
                return not self._is_alphabet_char(char) and not self._is_chinese_char(char)
        except TypeError:
            print("The input to the function must be a single character.")
        return False
    def _is_year(self, s: str):
        # True when s parses as an integer in [1900, 2099]
        try:
            number = int(s)
            return 1900 <= number <= 2099
        except ValueError:
            return False
    def __is_para_font_consistent(self, para_1, para_2):
        """
        This function compares the font of para1 and para2
        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2
        Returns
        -------
        is_same : bool
            True if para1 and para2 have the same font, else False
        """
        if para_1 is None or para_2 is None:
            return False
        para_1_font_type = safe_get(para_1, "para_font_type", "")
        para_1_font_size = safe_get(para_1, "para_font_size", 0)
        para_1_font_color = safe_get(para_1, "para_font_color", "")
        para_2_font_type = safe_get(para_2, "para_font_type", "")
        para_2_font_size = safe_get(para_2, "para_font_size", 0)
        para_2_font_color = safe_get(para_2, "para_font_color", "")
        if isinstance(para_1_font_type, list):  # get the most common font type
            para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count)
        if isinstance(para_2_font_type, list):
            para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count)
        if isinstance(para_1_font_size, list):  # compute average font size
            para_1_font_size = sum(para_1_font_size) / len(para_1_font_size)
        if isinstance(para_2_font_size, list):  # compute average font size
            para_2_font_size = sum(para_2_font_size) / len(para_2_font_size)
        # Font colour is deliberately ignored (see commented condition below)
        return (
            self.__is_similar_font_type(para_1_font_type, para_2_font_type)
            and abs(para_1_font_size - para_2_font_size) < 1.5
            # and para_font_color1 == para_font_color2
        )
    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)
        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2
        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph by using the puncs, else False
        """
        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()
        para_1_bboxes = safe_get(para_1, "para_bbox", [])
        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
        para_2_bboxes = safe_get(para_2, "para_bbox", [])
        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)
        # print_yellow("  Features of determine puncs_consistent:")
        # print(f"    para_1_text: {para_1_text}")
        # print(f"    para_2_text: {para_2_text}")
        # print(f"    para_1_bboxes: {para_1_bboxes}")
        # print(f"    para_2_bboxes: {para_2_bboxes}")
        # print(f"    para_1_font_sizes: {para_1_font_sizes}")
        # print(f"    para_2_font_sizes: {para_2_font_sizes}")
        # Multi-line paragraphs store one bbox per line: compare para_1's LAST
        # line against para_2's FIRST line.
        if is_nested_list(para_1_bboxes):
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
        else:
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes
        if is_nested_list(para_2_bboxes):
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
        else:
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes
        # NOTE(review): para_1_font_sizes is never reduced when it is a list,
        # so the arithmetic below would raise for nested para_1 bboxes — confirm upstream shape
        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold
        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold
        # Check if either para_text1 or para_text2 is empty
        if not para_1_text or not para_2_text:
            return False
        # Define the end puncs for a sentence to end and hyphen
        end_puncs = [".", "?", "!", "。", "?", "!", "…"]
        hyphen = ["-", "—"]
        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
        # NOTE(review): para_1_text is stripped above, so it cannot end with a
        # space; the para_1_end_with_space branch below looks unreachable — confirm intent
        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs
        # print_yellow(f"    para_1_end_with_hyphen: {para_1_end_with_hyphen}")
        # print_yellow(f"    para_1_end_with_end_punc: {para_1_end_with_end_punc}")
        # print_yellow(f"    para_1_not_end_with_end_punc: {para_1_not_end_with_end_punc}")
        # print_yellow(f"    para_1_end_with_space: {para_1_end_with_space}")
        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
            # print_red(f"para_1 is end with hyphen.")
            para_2_is_consistent = para_2_text and (
                para_2_text[0] in hyphen
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                # print(f"para_2 is consistent.\n")
                return True
            else:
                # print(f"para_2 is not consistent.\n")
                pass
        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
            # print_red(f"para_1 is end with end_punc.")
            para_2_is_consistent = (
                para_2_text
                and (
                    para_2_text[0] == " "
                    or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
                    or (self._is_chinese_char(para_2_text[0]))
                    or (self._is_other_letter_char(para_2_text[0]))
                )
                and not is_para2_left_indent_than_papa1
            )
            if para_2_is_consistent:
                # print(f"para_2 is consistent.\n")
                return True
            else:
                # print(f"para_2 is not consistent.\n")
                pass
        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
            # print_red(f"para_1 is NOT end with end_punc.")
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_alphabet_char(para_2_text[0]))
                or (self._is_year(para_2_text[0:4]))
                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                # print(f"para_2 is consistent.\n")
                return True
            else:
                # print(f"para_2 is not consistent.\n")
                pass
        elif para_1_end_with_space:  # If para_text1 ends with space
            # print_red(f"para_1 is end with space.")
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                # print(f"para_2 is consistent.\n")
                return True
            else:
                pass
                # print(f"para_2 is not consistent.\n")
        return False
    def _is_block_consistent(self, block1, block2):
        """
        This function determines whether block1 and block2 are originally from the same block
        Parameters
        ----------
        block1 : dict
            block1s
        block2 : dict
            block2
        Returns
        -------
        is_same : bool
            True if block1 and block2 are from the same block, else False
        """
        return self.__is_same_block_font(block1, block2)
    def _is_para_continued(self, para1, para2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph
        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2
        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph, else False
        """
        is_para_font_consistent = self.__is_para_font_consistent(para1, para2)
        is_para_puncs_consistent = self._is_para_puncs_consistent(para1, para2)
        return is_para_font_consistent and is_para_puncs_consistent
    def _are_boundaries_of_block_consistent(self, block1, block2):
        """
        This function checks if the boundaries of block1 and block2 are consistent
        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2
        Returns
        -------
        is_consistent : bool
            True if the boundaries of block1 and block2 are consistent, else False
        """
        # Compare the last line of block1 against the first line of block2
        last_line_of_block1 = block1["lines"][-1]
        first_line_of_block2 = block2["lines"][0]
        spans_of_last_line_of_block1 = last_line_of_block1["spans"]
        spans_of_first_line_of_block2 = first_line_of_block2["spans"]
        font_type_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["font"].lower()
        font_size_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["size"]
        font_color_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["color"]
        font_flags_of_last_line_of_block1 = spans_of_last_line_of_block1[0]["flags"]
        font_type_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["font"].lower()
        font_size_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["size"]
        font_color_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["color"]
        font_flags_of_first_line_of_block2 = spans_of_first_line_of_block2[0]["flags"]
        # Colour is read but deliberately excluded from the decision (see commented condition)
        return (
            self.__is_similar_font_type(font_type_of_last_line_of_block1, font_type_of_first_line_of_block2)
            and abs(font_size_of_last_line_of_block1 - font_size_of_first_line_of_block2) < 1
            # and font_color_of_last_line_of_block1 == font_color_of_first_line_of_block2
            and font_flags_of_last_line_of_block1 == font_flags_of_first_line_of_block2
        )
    def _get_last_paragraph(self, block):
        """
        Retrieves the last paragraph from a block.
        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.
        Returns
        -------
        dict
            The last paragraph of the block.
        """
        if block["paras"]:
            last_para_key = list(block["paras"].keys())[-1]
            return block["paras"][last_para_key]
        else:
            return None
    def _get_first_paragraph(self, block):
        """
        Retrieves the first paragraph from a block.
        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.
        Returns
        -------
        dict
            The first paragraph of the block.
        """
        if block["paras"]:
            first_para_key = list(block["paras"].keys())[0]
            return block["paras"][first_para_key]
        else:
            return None
    def should_merge_next_para(self, curr_para, next_para):
        # Thin public wrapper over the continuation heuristic
        if self._is_para_continued(curr_para, next_para):
            return True
        else:
            return False
    def batch_tag_paras(self, pdf_dict):
        # Tag every paragraph with its own location and, when a continuation is
        # detected, the location of the paragraph that continues it — possibly
        # in the next block or on a later (non-empty) page.
        the_last_page_id = len(pdf_dict) - 1
        for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
            if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
                para_blocks_of_curr_page = curr_page_content["para_blocks"]
                next_page_idx = curr_page_idx + 1
                next_page_id = f"page_{next_page_idx}"
                next_page_content = pdf_dict.get(next_page_id, {})
                for i, current_block in enumerate(para_blocks_of_curr_page):
                    for para_id, curr_para in current_block["paras"].items():
                        curr_para["curr_para_location"] = [
                            curr_page_idx,
                            current_block["block_id"],
                            int(para_id.split("_")[-1]),
                        ]
                        curr_para["next_para_location"] = None  # default: no continuation
                        curr_para["merge_next_para"] = False  # default: do not merge
                    next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None
                    if next_block:
                        # Link this block's last paragraph to the next block's first one
                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
                        next_block_first_para_key = list(next_block["paras"].keys())[0]
                        next_blk_first_para = next_block["paras"][next_block_first_para_key]
                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                            curr_blk_last_para["next_para_location"] = [
                                curr_page_idx,
                                next_block["block_id"],
                                int(next_block_first_para_key.split("_")[-1]),
                            ]
                            curr_blk_last_para["merge_next_para"] = True
                    else:
                        # Handle the case where the next block is in a different page
                        curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                        curr_blk_last_para = current_block["paras"][curr_block_last_para_key]
                        # Skip over pages that have no para_blocks
                        while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
                            next_page_idx += 1
                            next_page_id = f"page_{next_page_idx}"
                            next_page_content = pdf_dict.get(next_page_id, {})
                        if next_page_content.get("para_blocks", []):
                            next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
                            next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]
                            if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                                curr_blk_last_para["next_para_location"] = [
                                    next_page_idx,
                                    next_page_content["para_blocks"][0]["block_id"],
                                    int(next_blk_first_para_key.split("_")[-1]),
                                ]
                                curr_blk_last_para["merge_next_para"] = True
        return pdf_dict
    def find_block_by_id(self, para_blocks, block_id):
        # Linear scan for the block carrying block_id; None when absent
        for block in para_blocks:
            if block.get("block_id") == block_id:
                return block
        return None
    def batch_merge_paras(self, pdf_dict):
        # Splice each paragraph's tagged successor text into it, following
        # next_para_location chains across blocks and pages.
        for page_id, page_content in pdf_dict.items():
            if page_id.startswith("page_") and page_content.get("para_blocks", []):
                para_blocks_of_page = page_content["para_blocks"]
                for i in range(len(para_blocks_of_page)):
                    current_block = para_blocks_of_page[i]
                    paras = current_block["paras"]
                    for para_id, curr_para in list(paras.items()):
                        # Skip title paragraphs: titles never absorb following text
                        if curr_para.get("is_para_title"):
                            continue
                        while curr_para.get("merge_next_para"):
                            next_para_location = curr_para.get("next_para_location")
                            if not next_para_location:
                                break
                            next_page_idx, next_block_id, next_para_id = next_para_location
                            next_page_id = f"page_{next_page_idx}"
                            next_page_content = pdf_dict.get(next_page_id)
                            if not next_page_content:
                                break
                            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
                            if not next_block:
                                break
                            next_para = next_block["paras"].get(f"para_{next_para_id}")
                            if not next_para or next_para.get("is_para_title"):
                                break
                            # Merge the successor's text into the current paragraph
                            curr_para_text = curr_para.get("para_text", "")
                            next_para_text = next_para.get("para_text", "")
                            curr_para["para_text"] = curr_para_text + " " + next_para_text
                            # Follow the chain: adopt the successor's own successor
                            curr_para["next_para_location"] = next_para.get("next_para_location")
                            # Blank the absorbed paragraph's text to mark it as consumed
                            next_para["para_text"] = ""
                            # Continue merging only if the absorbed paragraph itself merged onward
                            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)
        return pdf_dict
from magic_pdf.para.commons import *
# Force UTF-8 stdout so CJK log text does not raise UnicodeEncodeError on narrow locales
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class BlockTerminationProcessor:
def __init__(self) -> None:
pass
def _is_consistent_lines(
self,
curr_line,
prev_line,
next_line,
consistent_direction, # 0 for prev, 1 for next, 2 for both
):
"""
This function checks if the line is consistent with its neighbors
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
consistent_direction : int
0 for prev, 1 for next, 2 for both
Returns
-------
bool
True if the line is consistent with its neighbors, False otherwise.
"""
curr_line_font_size = curr_line["spans"][0]["size"]
curr_line_font_type = curr_line["spans"][0]["font"].lower()
if consistent_direction == 0:
if prev_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type
else:
return False
elif consistent_direction == 1:
if next_line:
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
else:
return False
elif consistent_direction == 2:
if prev_line and next_line:
prev_line_font_size = prev_line["spans"][0]["size"]
prev_line_font_type = prev_line["spans"][0]["font"].lower()
next_line_font_size = next_line["spans"][0]["size"]
next_line_font_type = next_line["spans"][0]["font"].lower()
return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and (
curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type
)
else:
return False
else:
return False
def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
"""
This function checks if the line is a regular line
Parameters
----------
curr_line_bbox : list
bbox of the current line
prev_line_bbox : list
bbox of the previous line
next_line_bbox : list
bbox of the next line
avg_char_width : float
average of char widths
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_line_height : float
average of line heights
Returns
-------
bool
True if the line is a regular line, False otherwise.
"""
horizontal_ratio = 0.5
vertical_ratio = 0.5
horizontal_thres = horizontal_ratio * avg_char_width
vertical_thres = vertical_ratio * avg_line_height
x0, y0, x1, y1 = curr_line_bbox
x0_near_X0 = abs(x0 - X0) < horizontal_thres
x1_near_X1 = abs(x1 - X1) < horizontal_thres
prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
return (
(sufficient_spacing_above or sufficient_spacing_below)
or (not x0_near_X0 and not x1_near_X1)
or prev_line_is_end_of_para
)
def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
"""
This function checks if the line is a possible start of a paragraph
Parameters
----------
curr_line : dict
current line
prev_line : dict
previous line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
avg_line_height : float
average of line heights
Returns
-------
bool
True if the line is a possible start of a paragraph, False otherwise.
"""
start_confidence = 0.5 # Initial confidence of the line being a start of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
prev_line_bbox = prev_line["bbox"] if prev_line else None
next_line_bbox = next_line["bbox"] if next_line else None
indent_ratio = 1
vertical_ratio = 1.5
vertical_thres = vertical_ratio * avg_font_size
left_horizontal_ratio = 0.5
left_horizontal_thres = left_horizontal_ratio * avg_char_width
right_horizontal_ratio = 2.5
right_horizontal_thres = right_horizontal_ratio * avg_char_width
x0, y0, x1, y1 = curr_line_bbox
indent_condition = x0 > X0 + indent_ratio * avg_char_width
if indent_condition:
start_confidence += 0.2
decision_path.append("indent_condition_met")
x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
if x0_near_X0:
start_confidence += 0.1
decision_path.append("x0_near_X0")
x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
if x1_near_X1:
start_confidence += 0.1
decision_path.append("x1_near_X1")
if prev_line is None:
prev_line_is_end_of_para = True
start_confidence += 0.2
decision_path.append("no_prev_line")
else:
prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
if prev_line_is_end_of_para:
start_confidence += 0.1
decision_path.append("prev_line_is_end_of_para")
sufficient_spacing_above = False
if prev_line_bbox:
vertical_spacing_above = y1 - prev_line_bbox[3]
sufficient_spacing_above = vertical_spacing_above > vertical_thres
if sufficient_spacing_above:
start_confidence += 0.2
decision_path.append("sufficient_spacing_above")
sufficient_spacing_below = False
if next_line_bbox:
vertical_spacing_below = next_line_bbox[1] - y0
sufficient_spacing_below = vertical_spacing_below > vertical_thres
if sufficient_spacing_below:
start_confidence += 0.2
decision_path.append("sufficient_spacing_below")
is_regular_line = self._is_regular_line(
curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
)
if is_regular_line:
start_confidence += 0.1
decision_path.append("is_regular_line")
is_start_of_para = (
(sufficient_spacing_above or sufficient_spacing_below)
or (indent_condition)
or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
or prev_line_is_end_of_para
)
return (is_start_of_para, start_confidence, decision_path)
def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
"""
This function checks if the line is a possible end of a paragraph
Parameters
----------
curr_line : dict
current line
next_line : dict
next line
X0 : float
median of x0 values, which represents the left average boundary of the page
X1 : float
median of x1 values, which represents the right average boundary of the page
avg_char_width : float
average of char widths
Returns
-------
bool
True if the line is a possible end of a paragraph, False otherwise.
"""
end_confidence = 0.5 # Initial confidence of the line being a end of a paragraph
decision_path = [] # Record the decision path
curr_line_bbox = curr_line["bbox"]
next_line_bbox = next_line["bbox"] if next_line else None
left_horizontal_ratio = 0.5
right_horizontal_ratio = 0.5
x0, _, x1, y1 = curr_line_bbox
next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0)
x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width
if x0_near_X0:
end_confidence += 0.1
decision_path.append("x0_near_X0")
x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width
if x1_smaller_than_X1:
end_confidence += 0.1
decision_path.append("x1_smaller_than_X1")
next_line_is_start_of_para = (
next_line_bbox
and (next_x0 > X0 + left_horizontal_ratio * avg_char_width)
and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1))
)
if next_line_is_start_of_para:
end_confidence += 0.2
decision_path.append("next_line_is_start_of_para")
is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if is_line_left_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_left_aligned_from_neighbors")
is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors(
curr_line_bbox, None, next_line_bbox, avg_char_width
)
if not is_line_right_aligned_from_neighbors_bool:
end_confidence += 0.1
decision_path.append("line_is_not_right_aligned_from_neighbors")
is_end_of_para = end_with_punctuation(curr_line["text"]) and (
(x0_near_X0 and x1_smaller_than_X1)
or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool)
)
return (is_end_of_para, end_confidence, decision_path)
def _cut_paras_per_block(
self,
block,
):
"""
Processes a raw block from PyMuPDF and returns the processed block.
Parameters
----------
raw_block : dict
A raw block from pymupdf.
Returns
-------
processed_block : dict
"""
def _construct_para(lines, is_block_title, para_title_level):
"""
Construct a paragraph from given lines.
"""
font_sizes = [span["size"] for line in lines for span in line["spans"]]
avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
font_colors = [span["color"] for line in lines for span in line["spans"]]
most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None
# font_types = [span["font"] for line in lines for span in line["spans"]]
# most_common_font_type = max(set(font_types), key=font_types.count) if font_types else None
font_type_lengths = {}
for line in lines:
for span in line["spans"]:
font_type = span["font"]
bbox_width = span["bbox"][2] - span["bbox"][0]
if font_type in font_type_lengths:
font_type_lengths[font_type] += bbox_width
else:
font_type_lengths[font_type] = bbox_width
# get the font type with the longest bbox width
most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None # type: ignore
para_bbox = calculate_para_bbox(lines)
para_text = " ".join(line["text"] for line in lines)
return {
"para_bbox": para_bbox,
"para_text": para_text,
"para_font_type": most_common_font_type,
"para_font_size": avg_font_size,
"para_font_color": most_common_font_color,
"is_para_title": is_block_title,
"para_title_level": para_title_level,
}
block_bbox = block["bbox"]
block_text = block["text"]
block_lines = block["lines"]
X0 = safe_get(block, "X0", 0)
X1 = safe_get(block, "X1", 0)
avg_char_width = safe_get(block, "avg_char_width", 0)
avg_char_height = safe_get(block, "avg_char_height", 0)
avg_font_size = safe_get(block, "avg_font_size", 0)
is_block_title = safe_get(block, "is_block_title", False)
para_title_level = safe_get(block, "block_title_level", 0)
# Segment into paragraphs
para_ranges = []
in_paragraph = False
start_idx_of_para = None
# Create the processed paragraphs
processed_paras = {}
para_bboxes = []
end_idx_of_para = 0
for line_index, line in enumerate(block_lines):
curr_line = line
prev_line = block_lines[line_index - 1] if line_index > 0 else None
next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None
"""
Start processing paragraphs.
"""
# Check if the line is the start of a paragraph
is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para(
curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
)
if not in_paragraph and is_start_of_para:
in_paragraph = True
start_idx_of_para = line_index
# print_green(">>> Start of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" start_confidence: ", start_confidence)
# print(" decision_path: ", decision_path)
# Check if the line is the end of a paragraph
is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para(
curr_line, next_line, X0, X1, avg_char_width
)
if in_paragraph and (is_end_of_para or not next_line):
para_ranges.append((start_idx_of_para, line_index))
start_idx_of_para = None
in_paragraph = False
# print_red(">>> End of a paragraph")
# print(" curr_line_text: ", curr_line["text"])
# print(" end_confidence: ", end_confidence)
# print(" decision_path: ", decision_path)
# Add the last paragraph if it is not added
if in_paragraph and start_idx_of_para is not None:
para_ranges.append((start_idx_of_para, len(block_lines) - 1))
# Process the matched paragraphs
for para_index, (start_idx, end_idx) in enumerate(para_ranges):
matched_lines = block_lines[start_idx : end_idx + 1]
para_properties = _construct_para(matched_lines, is_block_title, para_title_level)
para_key = f"para_{len(processed_paras)}"
processed_paras[para_key] = para_properties
para_bboxes.append(para_properties["para_bbox"])
end_idx_of_para = end_idx + 1
# Deal with the remaining lines
if end_idx_of_para < len(block_lines):
unmatched_lines = block_lines[end_idx_of_para:]
unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level)
unmatched_key = f"para_{len(processed_paras)}"
processed_paras[unmatched_key] = unmatched_properties
para_bboxes.append(unmatched_properties["para_bbox"])
block["paras"] = processed_paras
return block
def batch_process_blocks(self, pdf_dict):
    """Cut every block of every page into paragraphs.

    Parameters
    ----------
    pdf_dict : dict
        PDF dictionary keyed by ``page_<n>``.

    Returns
    -------
    dict
        The same dictionary, with each page's ``para_blocks`` rebuilt and
        ``statistics.num_paras`` updated with the total paragraph count.
    """
    total_paras = 0
    for key, page in pdf_dict.items():
        if not key.startswith("page_"):
            continue
        rebuilt_blocks = []
        if "para_blocks" in page.keys():
            for raw_block in page["para_blocks"]:
                cut_block = self._cut_paras_per_block(raw_block)
                rebuilt_blocks.append(cut_block)
                total_paras += len(cut_block["paras"])
        page["para_blocks"] = rebuilt_blocks
    pdf_dict["statistics"]["num_paras"] = total_paras
    return pdf_dict
import sys
from magic_pdf.libs.commons import fitz
from termcolor import cprint
# Ensure stdout can emit UTF-8 (e.g. Chinese log text) regardless of locale.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
def open_pdf(pdf_path):
    """Open *pdf_path* with fitz and return the document.

    On failure the error is printed and the original exception re-raised.
    """
    try:
        return fitz.open(pdf_path)  # type: ignore
    except Exception as e:
        print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
        raise e
def print_green_on_red(text):
    # Debug helper: bold green text on a red background (termcolor).
    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")
def print_green(text):
    # Debug helper: bold green text preceded by a blank line.
    print()
    cprint(text, "green", attrs=["bold"], end="\n\n")
def print_red(text):
    # Debug helper: bold red text preceded by a blank line.
    print()
    cprint(text, "red", attrs=["bold"], end="\n\n")
def print_yellow(text):
    # Debug helper: bold yellow text preceded by a blank line.
    print()
    cprint(text, "yellow", attrs=["bold"], end="\n\n")
def safe_get(dict_obj, key, default):
    """Like ``dict.get``, but also substitute *default* when the stored value is None."""
    value = dict_obj.get(key)
    return default if value is None else value
def is_bbox_overlap(bbox1, bbox2):
    """Return True when the two axis-aligned boxes intersect (touching counts).

    Parameters
    ----------
    bbox1, bbox2 : list
        Boxes as ``[x0, y0, x1, y1]``.

    Returns
    -------
    bool
        True if the boxes overlap, else False.
    """
    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2
    # Overlap requires intersection on both axes.
    overlaps_horizontally = ax0 <= bx1 and bx0 <= ax1
    overlaps_vertically = ay0 <= by1 and by0 <= ay1
    return overlaps_horizontally and overlaps_vertically
def is_in_bbox(bbox1, bbox2):
    """Return True when *bbox1* lies entirely inside *bbox2* (edges may touch).

    Parameters
    ----------
    bbox1, bbox2 : list
        Boxes as ``[x0, y0, x1, y1]``.

    Returns
    -------
    bool
        True if bbox1 is contained in bbox2, else False.
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2
    return (
        outer_x0 <= inner_x0
        and outer_y0 <= inner_y0
        and inner_x1 <= outer_x1
        and inner_y1 <= outer_y1
    )
def calculate_para_bbox(lines):
    """Return the minimal bbox enclosing every line of the paragraph.

    Parameters
    ----------
    lines : list
        Line dicts, each carrying a ``bbox`` of ``[x0, y0, x1, y1]``.

    Returns
    -------
    list
        ``[x0, y0, x1, y1]`` union of all line bboxes.
    """
    return [
        min(line["bbox"][0] for line in lines),
        min(line["bbox"][1] for line in lines),
        max(line["bbox"][2] for line in lines),
        max(line["bbox"][3] for line in lines),
    ]
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """Check whether the line's right edge matches its neighbors' right edges.

    Parameters
    ----------
    curr_line_bbox, prev_line_bbox, next_line_bbox : list or None
        Line bboxes; a missing neighbor may be None.
    avg_char_width : float
        Average character width; half of it is the alignment tolerance.
    direction : int
        0 = compare with previous only, 1 = next only, 2 = both.

    Returns
    -------
    bool
        True when the selected neighbors are right-aligned with the line.
    """
    tolerance = 0.5 * avg_char_width
    curr_x1 = curr_line_bbox[2]
    # A missing neighbor contributes a zero edge, which normally fails the test.
    prev_x1 = prev_line_bbox[2] if prev_line_bbox else 0
    next_x1 = next_line_bbox[2] if next_line_bbox else 0
    matches_prev = abs(curr_x1 - prev_x1) < tolerance
    matches_next = abs(curr_x1 - next_x1) < tolerance
    if direction == 0:
        return matches_prev
    if direction == 1:
        return matches_next
    if direction == 2:
        return matches_prev and matches_next
    return False
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """Check whether the line's left edge matches its neighbors' left edges.

    Parameters
    ----------
    curr_line_bbox, prev_line_bbox, next_line_bbox : list or None
        Line bboxes; a missing neighbor may be None.
    avg_char_width : float
        Average character width; half of it is the alignment tolerance.
    direction : int
        0 = compare with previous only, 1 = next only, 2 = both.

    Returns
    -------
    bool
        True when the selected neighbors are left-aligned with the line.
    """
    tolerance = 0.5 * avg_char_width
    curr_x0 = curr_line_bbox[0]
    # A missing neighbor contributes a zero edge, which normally fails the test.
    prev_x0 = prev_line_bbox[0] if prev_line_bbox else 0
    next_x0 = next_line_bbox[0] if next_line_bbox else 0
    matches_prev = abs(curr_x0 - prev_x0) < tolerance
    matches_next = abs(curr_x0 - next_x0) < tolerance
    if direction == 0:
        return matches_prev
    if direction == 1:
        return matches_next
    if direction == 2:
        return matches_prev and matches_next
    return False
def end_with_punctuation(line_text):
    """Return True when the last non-whitespace character is an English or
    Chinese sentence terminator (. ? ! 。 ? !)."""
    terminators = ".?!" + "。?!"
    stripped = line_text.rstrip()
    if not stripped:
        # Empty or all-whitespace line: nothing to terminate.
        return False
    return stripped[-1] in terminators
def is_nested_list(lst):
    """Return True when *lst* is a list containing at least one list element."""
    if not isinstance(lst, list):
        return False
    return any(isinstance(item, list) for item in lst)
import math
from collections import defaultdict
from magic_pdf.para.commons import *
# Ensure stdout can emit UTF-8 (e.g. Chinese log text) regardless of locale.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class HeaderFooterProcessor:
def __init__(self) -> None:
pass
def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
"""
This function gets the most common bboxes from the bboxes
Parameters
----------
bboxes : list
bboxes
page_height : float
height of the page
position : str, optional
"top" or "bottom", by default "top"
threshold : float, optional
threshold, by default 0.25
num_bboxes : int, optional
number of bboxes to return, by default 3
min_frequency : int, optional
minimum frequency of the bbox, by default 2
Returns
-------
common_bboxes : list
common bboxes
"""
# Filter bbox by position
if position == "top":
filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
else:
filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
# Find the most common bbox
bbox_count = defaultdict(int)
for bbox in filtered_bboxes:
bbox_count[tuple(bbox)] += 1
# Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
common_bboxes = [
bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
][:num_bboxes]
return common_bboxes
def detect_footer_header(self, result_dict, similarity_threshold=0.5):
"""
This function detects the header and footer of the document.
Parameters
----------
result_dict : dict
result dictionary
Returns
-------
result_dict : dict
result dictionary
"""
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
def is_single_line_block(block):
# Determine based on the width and height of the block
block_width = block["X1"] - block["X0"]
block_height = block["bbox"][3] - block["bbox"][1]
# If the height of the block is close to the average character height and the width is large, it is considered a single line
return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
# Traverse all blocks in the document
single_preproc_blocks = 0
total_blocks = 0
single_preproc_blocks = 0
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
total_blocks += 1
if is_single_line_block(block):
single_preproc_blocks += 1
# If there are no blocks, skip the header and footer detection
if total_blocks == 0:
print("No blocks found. Skipping header/footer detection.")
return result_dict
# If most of the blocks are single-line, skip the header and footer detection
if single_preproc_blocks / total_blocks > 0.5: # 50% of the blocks are single-line
return result_dict
# Collect the bounding boxes of all blocks
all_bboxes = []
all_texts = []
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
all_bboxes.append(block["bbox"])
# Get the height of the page
page_height = max(bbox[3] for bbox in all_bboxes)
# Get the most common bbox lists for headers and footers
common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
# Detect and mark headers and footers
for page_id, blocks in result_dict.items():
if page_id.startswith("page_"):
for block_key, block in blocks.items():
if block_key.startswith("block_"):
bbox = block["bbox"]
text = block["text"]
is_header = compare_bbox_with_list(bbox, common_header_bboxes)
is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
block["is_header"] = int(is_header)
block["is_footer"] = int(is_footer)
return result_dict
class NonHorizontalTextProcessor:
    """Flags watermarks and vertical margin notes in a parsed PDF.

    Both are recognised as blocks whose text direction is rotated and whose
    (bbox, text) pair repeats on more than half of the pages: tilted repeats
    are watermarks, near-vertical repeats are vertical margin notes.
    """

    def __init__(self) -> None:
        pass

    def detect_non_horizontal_texts(self, result_dict):
        """Mark every block with ``is_watermark`` and ``is_vertical_margin_note``.

        Parameters
        ----------
        result_dict : dict
            Pages keyed ``page_<n>``, each mapping ``block_<m>`` to block data.

        Returns
        -------
        dict
            The same dictionary with both flags (0/1) added to each block.
        """
        # (bbox, text) -> number of occurrences, per category.
        potential_watermarks = {}
        potential_margin_notes = {}
        for page_id, page_content in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in page_content.items():
                if not block_id.startswith("block_"):
                    continue
                if "dir" not in block_data:
                    continue
                # tuple() keeps the key hashable even when bbox is a list.
                coordinates_text = (tuple(block_data["bbox"]), block_data["text"])
                angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
                angle = abs(math.degrees(angle))
                if 5 < angle < 85:  # tilted text: watermark candidate
                    potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
                if 85 < angle < 105:  # near-vertical text: margin-note candidate
                    potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1
        # Candidates repeating on more than half of the pages are confirmed
        # (headers/footers can change per page; watermarks do not).
        threshold = len(result_dict) // 2
        watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
        margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            for block_id, block_data in blocks.items():
                # Fix: the original iterated every page entry here without the
                # block_ prefix check, which could KeyError on non-block keys.
                if not block_id.startswith("block_"):
                    continue
                coordinates_text = (tuple(block_data["bbox"]), block_data["text"])
                block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
                block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0
        return result_dict
class NoiseRemover:
    """Drops blocks flagged as noise (overlaps, headers, footers, watermarks,
    vertical margin notes, block titles)."""

    def __init__(self) -> None:
        pass

    def skip_data_noises(self, result_dict):
        """Return a filtered copy of *result_dict* without noise blocks.

        A block is noise when any of the known flags is truthy. Pages left
        with no blocks are dropped entirely.
        """
        noise_flags = (
            "is_overlap",
            "is_header",
            "is_footer",
            "is_watermark",
            "is_vertical_margin_note",
            "is_block_title",
        )
        cleaned = {}
        for page_id, blocks in result_dict.items():
            if not page_id.startswith("page_"):
                continue
            kept = {
                block_id: block
                for block_id, block in blocks.items()
                if block_id.startswith("block_")
                and not any(block.get(flag, 0) for flag in noise_flags)
            }
            if kept:
                cleaned[page_id] = kept
        return cleaned
from magic_pdf.libs.commons import fitz
from magic_pdf.para.commons import *
# Ensure stdout can emit UTF-8 (e.g. Chinese log text) regardless of locale.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class DrawAnnos:
    """Draws rectangle annotations for detected paragraphs onto a PDF.

    Color legend used here:
        green (0, 1, 0) - ordinary paragraph
        cyan  (0, 1, 1) - combined (multi-bbox) paragraph
        blue  (0, 0, 1) - title (single or combined)
    RGB reference: Red (1,0,0), Yellow (1,1,0), Magenta (1,0,1),
    White (1,1,1), Black (0,0,0), Gray (0.5,0.5,0.5), Orange (1,0.65,0).
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """Return True when *lst* is a list that (at any depth) contains a list."""
        if isinstance(lst, list):
            return any(self.__is_nested_list(i) for i in lst) or any(
                isinstance(i, list) for i in lst
            )
        return False

    def __valid_rect(self, bbox):
        """Return True for a non-degenerate flat rectangle [x0, y0, x1, y1]."""
        if isinstance(bbox[0], list):
            return False  # nested list, not a flat rectangle
        return bbox[0] < bbox[2] and bbox[1] < bbox[3]

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """Recursively draw every rectangle contained in *nested_bbox*.

        Parameters
        ----------
        page : fitz.Page
            Target page.
        nested_bbox : list
            A rectangle or arbitrarily nested lists of rectangles.
        color : tuple
            Stroke color; cyan by default (combined paragraphs).
        """
        if self.__is_nested_list(nested_bbox):
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)
        elif self.__valid_rect(nested_bbox):
            para_rect = fitz.Rect(nested_bbox)
            para_anno = page.add_rect_annot(para_rect)
            para_anno.set_colors(stroke=color)
            para_anno.set_border(width=1)
            para_anno.update()

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """Annotate every paragraph/title bbox of *pdf_dic* onto the PDF.

        Writes the annotated document to *output_pdf_path* (defaults to
        ``<input>_anno.pdf`` when None).
        """
        pdf_doc = open_pdf(input_pdf_path)
        if pdf_dic is None:
            pdf_dic = {}
        if output_pdf_path is None:
            output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf")
        for page_id, page in enumerate(pdf_doc):  # type: ignore
            page_key = f"page_{page_id}"
            for ele_key, ele_data in pdf_dic[page_key].items():
                if ele_key != "para_blocks":
                    continue
                for para_block in ele_data:
                    if "paras" not in para_block.keys():
                        continue
                    for para_key, para_content in para_block["paras"].items():
                        para_bbox = para_content["para_bbox"]
                        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
                            # Combined paragraph: cyan.
                            self.__draw_nested_boxes(page, para_bbox, (0, 1, 1))
                        elif self.__valid_rect(para_bbox):
                            para_rect = fitz.Rect(para_bbox)
                            para_anno = page.add_rect_annot(para_rect)
                            para_anno.set_colors(stroke=(0, 1, 0))  # green: normal paragraph
                            para_anno.set_border(width=0.5)
                            para_anno.update()
                        if para_content["is_para_title"]:
                            title_bbox = para_content["para_bbox"]
                            if self.__is_nested_list(title_bbox) and len(title_bbox) > 1:
                                # Combined title: blue (the old comment wrongly said cyan).
                                self.__draw_nested_boxes(page, title_bbox, (0, 0, 1))
                            elif self.__valid_rect(title_bbox):
                                # Fix: the original checked __valid_rect twice here.
                                title_rect = fitz.Rect(title_bbox)
                                title_anno = page.add_rect_annot(title_rect)
                                title_anno.set_colors(stroke=(0, 0, 1))  # blue: title
                                title_anno.set_border(width=0.5)
                                title_anno.update()
        pdf_doc.save(output_pdf_path)
        pdf_doc.close()
class DenseSingleLineBlockException(Exception):
    """Raised when a document is dominated by dense single-line blocks."""

    def __init__(self, message="DenseSingleLineBlockException"):
        self.message = message
        super().__init__(message)

    def __str__(self):
        return f"{self.message}"

    __repr__ = __str__
class TitleDetectionException(Exception):
    """Raised when title detection fails."""

    def __init__(self, message="TitleDetectionException"):
        self.message = message
        super().__init__(message)

    def __str__(self):
        return f"{self.message}"

    __repr__ = __str__
class TitleLevelException(Exception):
    """Raised when title-level assignment fails."""

    def __init__(self, message="TitleLevelException"):
        self.message = message
        super().__init__(message)

    def __str__(self):
        return f"{self.message}"

    __repr__ = __str__
class ParaSplitException(Exception):
    """Raised when paragraph splitting fails."""

    def __init__(self, message="ParaSplitException"):
        self.message = message
        super().__init__(message)

    def __str__(self):
        return f"{self.message}"

    __repr__ = __str__
class ParaMergeException(Exception):
    """Raised when paragraph merging fails."""

    def __init__(self, message="ParaMergeException"):
        self.message = message
        super().__init__(message)

    def __str__(self):
        return f"{self.message}"

    __repr__ = __str__
class DiscardByException:
    """Decides whether a parsed PDF should be discarded, per exception type.

    Each ``discard_by_*`` method returns the exception's message when the
    document should be dropped, or ``None`` to keep it.
    """

    def __init__(self) -> None:
        pass

    def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException):
        """Discard when too many pages consist almost entirely of single-line blocks.

        A page is "exceptional" when over 90% of its preproc blocks have a
        single line; the document is discarded when over 10% of the pages are
        exceptional.

        Parameters
        ----------
        pdf_dic : dict
            PDF dictionary keyed by ``page_<n>``.
        exception : DenseSingleLineBlockException
            Supplies the message returned on discard.

        Returns
        -------
        str or None
            The exception message when discarded, else None.
        """
        exception_pages = 0
        total_pages = 0
        for page_id, page in pdf_dic.items():
            if not page_id.startswith("page_"):
                continue
            total_pages += 1
            if "preproc_blocks" not in page.keys():
                continue
            preproc_blocks = page["preproc_blocks"]
            single_line_count = sum(1 for blk in preproc_blocks if len(blk["lines"]) == 1)
            if len(preproc_blocks) > 0 and single_line_count / len(preproc_blocks) > 0.9:
                exception_pages += 1
        if total_pages == 0:
            return None
        # A low ratio suffices: documents matching at all are almost always dropped.
        if exception_pages / total_pages > 0.1:
            return exception.message
        return None

    def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
        """Placeholder: title-detection failures currently never discard a document."""
        # return exception.message
        return None

    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
        """Placeholder: title-level failures currently never discard a document."""
        # return exception.message
        return None

    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
        """Placeholder: paragraph-split failures currently never discard a document."""
        # return exception.message
        return None

    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
        """Placeholder: paragraph-merge failures currently never discard a document."""
        # return exception.message
        return None
import math
from magic_pdf.para.commons import *
# Ensure stdout can emit UTF-8 (e.g. Chinese log text) regardless of locale.
if sys.version_info[0] >= 3:
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
class LayoutFilterProcessor:
    """Marks each paragraph block with whether it falls inside any layout box."""

    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """Set ``in_layout`` (0/1) on every para block of every page.

        Bug fix: the original reset ``in_layout`` to 0 on every layout-bbox
        iteration, so only membership in the *last* layout box survived.
        Now all blocks are initialised once, then flagged for any layout box
        that contains them.

        Parameters
        ----------
        pdf_dict : dict
            PDF dictionary keyed by ``page_<n>``.

        Returns
        -------
        dict
            The same dictionary with ``in_layout`` set on each para block.
        """
        for page_id, blocks in pdf_dict.items():
            if not page_id.startswith("page_"):
                continue
            if "layout_bboxes" not in blocks.keys() or "para_blocks" not in blocks.keys():
                continue
            layout_bbox_objs = blocks["layout_bboxes"]
            if layout_bbox_objs is None:
                continue
            # Round every layout bbox coordinate up so borderline blocks
            # still count as contained.
            layout_bboxes = [
                [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)]
                for x0, y0, x1, y1 in (obj["layout_bbox"] for obj in layout_bbox_objs)
            ]
            para_blocks = blocks["para_blocks"]
            if para_blocks is None:
                continue
            for para_block in para_blocks:
                para_block["in_layout"] = 0
            for lb_bbox in layout_bboxes:
                for para_block in para_blocks:
                    if is_in_bbox(para_block["bbox"], lb_bbox):
                        para_block["in_layout"] = 1
            blocks["para_blocks"] = para_blocks
        return pdf_dict
import numpy as np
from loguru import logger
from sklearn.cluster import DBSCAN
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.boxbase import \
_is_in_or_part_overlap_with_area_ratio as is_in_layout
# Characters that can terminate a line/sentence (English and full-width
# Chinese variants); used when deciding where a paragraph ends.
LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', ':', ':', ')', ')', ';']
# Shorthand aliases for span content types.
INLINE_EQUATION = ContentType.InlineEquation
INTERLINE_EQUATION = ContentType.InterlineEquation
TEXT = ContentType.Text
def __get_span_text(span):
    """Return a span's text content, falling back to its image path when empty."""
    content = span.get('content', '')
    if len(content) > 0:
        return content
    return span.get('image_path', '')
def __detect_list_lines(lines, new_layout_bboxes, lang):
    """Detect bullet-list structure in *lines* and split the line range accordingly.

    A list is characterised by a flush-left line starting with an uppercase
    letter or a digit, followed by indented lines whose first character may
    be lowercase. Only implemented for English ('en'); other languages are
    returned unchanged with a None second element.

    Returns
    -------
    tuple
        ``(segments, list_start_indices)`` where *segments* is a list of
        ``(kind, start, end)`` triples with kind in {'text', 'list'}, and
        *list_start_indices* holds, per list run, the flush-left line indices.
    """
    def find_repeating_patterns(lst):
        # Locate runs matching "1 followed by 2/3" that repeat — the encoded
        # signature of a bullet list. Returns (start, end) ranges plus, per
        # run, the indices of the flush-left (code 1) lines.
        indices = []
        ones_indices = []
        i = 0
        while i < len(lst) - 1:  # ensure at least 2 elements remain
            if lst[i] == 1 and lst[i + 1] in [2, 3]:  # extra check guards against consecutive 1s
                start = i
                ones_in_this_interval = [i]
                i += 1
                while i < len(lst) and lst[i] in [2, 3]:
                    i += 1
                # Verify the following sequence also matches the pattern
                if (
                    i < len(lst) - 1
                    and lst[i] == 1
                    and lst[i + 1] in [2, 3]
                    and lst[i - 1] in [2, 3]
                ):
                    while i < len(lst) and lst[i] in [1, 2, 3]:
                        if lst[i] == 1:
                            ones_in_this_interval.append(i)
                        i += 1
                    indices.append((start, i - 1))
                    ones_indices.append(ones_in_this_interval)
                else:
                    i += 1
            else:
                i += 1
        return indices, ones_indices

    """===================="""

    def split_indices(slen, index_array):
        # Partition [0, slen) into alternating "text" / "list" segments, where
        # *index_array* holds the (start, end) ranges of the detected lists.
        result = []
        last_end = 0
        for start, end in sorted(index_array):
            if start > last_end:
                # The gap before this list range is ordinary text
                result.append(('text', last_end, start - 1))
            # The range itself is a list
            result.append(('list', start, end))
            last_end = end + 1
        if last_end < slen:
            # Whatever remains after the final list range is ordinary text
            result.append(('text', last_end, slen - 1))
        return result

    """===================="""
    if lang != 'en':
        # List detection is only implemented for English.
        return lines, None
    else:
        total_lines = len(lines)
        line_fea_encode = []
        # Encode each line: 1 = flush-left and starts with uppercase/digit,
        # 4 = flush-left otherwise, 2 = indented and starts with uppercase,
        # 3 = indented otherwise. (Original rules restated in the note below.)
        """
        对每一行进行特征编码,编码规则如下:
        1. 如果行顶格,且大写字母开头或者数字开头,编码为1
        2. 如果顶格,其他非大写开头编码为4
        3. 如果非顶格,首字符大写,编码为2
        4. 如果非顶格,首字符非大写编码为3
        """
        for l in lines:  # noqa: E741
            first_char = __get_span_text(l['spans'][0])[0]
            layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0]
            if l['bbox'][0] == layout_left:
                if first_char.isupper() or first_char.isdigit():
                    line_fea_encode.append(1)
                else:
                    line_fea_encode.append(4)
            else:
                if first_char.isupper():
                    line_fea_encode.append(2)
                else:
                    line_fea_encode.append(3)
        # Split by encoding: runs where 1,2,3 repeat at least twice are lists.
        list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
        if len(list_indice) > 0:
            logger.info(f'发现了列表,列表行数:{list_indice}, {list_start_idx}')
            # TODO: check that the indented lines inside these lists are left-aligned.
            for start, end in list_indice:
                for i in range(start, end + 1):
                    if i > 0:
                        if line_fea_encode[i] == 4:
                            logger.info(f'列表行的第{i}行不是顶格的')
                            break
                else:
                    logger.info(f'列表行的第{start}到第{end}行是列表')
        return split_indices(total_lines, list_indice), list_start_idx
def __valign_lines(blocks, layout_bboxes):
    """Snap the left/right edges of lines inside each layout box.

    Clusters all line x0 and x1 values with DBSCAN; edges within a cluster
    (difference under a small threshold) are forced to a common value (min
    for x0, max for x1) so the text reads as flush. The threshold of 3 is an
    empirical value (TODO: derive it, roughly 1.5 body-text char widths).
    Returns the recomputed layout bboxes, since block bboxes may change.
    """
    min_distance = 3
    min_sample = 2
    new_layout_bboxes = []
    for layout_box in layout_bboxes:
        blocks_in_layoutbox = [
            b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])
        ]
        if len(blocks_in_layoutbox) == 0:
            continue
        # Gather each line's left (x0) and right (x1) edge; the trailing 0 is
        # a dummy second dimension because DBSCAN expects 2-D samples.
        x0_lst = np.array(
            [
                [line['bbox'][0], 0]
                for block in blocks_in_layoutbox
                for line in block['lines']
            ]
        )
        x1_lst = np.array(
            [
                [line['bbox'][2], 0]
                for block in blocks_in_layoutbox
                for line in block['lines']
            ]
        )
        x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
        x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst)
        x0_uniq_label = np.unique(x0_clusters.labels_)
        x1_uniq_label = np.unique(x1_clusters.labels_)
        x0_2_new_val = {}  # maps each original edge value to its snapped value
        x1_2_new_val = {}
        for label in x0_uniq_label:
            if label == -1:
                continue  # -1 is DBSCAN noise: leave those edges untouched
            x0_index_of_label = np.where(x0_clusters.labels_ == label)
            x0_raw_val = x0_lst[x0_index_of_label][:, 0]
            x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0])
            x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val})
        for label in x1_uniq_label:
            if label == -1:
                continue  # noise cluster, skip
            x1_index_of_label = np.where(x1_clusters.labels_ == label)
            x1_raw_val = x1_lst[x1_index_of_label][:, 0]
            x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0])
            x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val})
        for block in blocks_in_layoutbox:
            for line in block['lines']:
                x0, x1 = line['bbox'][0], line['bbox'][2]
                if x0 in x0_2_new_val:
                    line['bbox'][0] = int(x0_2_new_val[x0])
                if x1 in x1_2_new_val:
                    line['bbox'][2] = int(x1_2_new_val[x1])
        # Edges that fell outside every cluster stay unchanged.
        # Line widths changed above, so recompute each block bbox from its lines.
        for block in blocks_in_layoutbox:
            block['bbox'] = [
                min([line['bbox'][0] for line in block['lines']]),
                min([line['bbox'][1] for line in block['lines']]),
                max([line['bbox'][2] for line in block['lines']]),
                max([line['bbox'][3] for line in block['lines']]),
            ]
        """新计算layout的bbox,因为block的bbox变了。"""
        # Recompute the layout bbox as well, since the block bboxes changed.
        layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
        layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
        layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
        layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
        new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
    return new_layout_bboxes
def __align_text_in_layout(blocks, layout_bboxes):
    """Clamp OCR line bboxes to the left/right edges of their layout box.

    OCR lines sometimes carry leading/trailing blank space; any part of a
    line extending past its layout box is truncated at the box edge.
    """
    for layout in layout_bboxes:
        lb = layout['layout_bbox']
        left_edge, right_edge = lb[0], lb[2]
        contained = [blk for blk in blocks if is_in_layout(blk['bbox'], lb)]
        for blk in contained:
            for line in blk['lines']:
                line['bbox'][0] = max(line['bbox'][0], left_edge)
                line['bbox'][2] = min(line['bbox'][2], right_edge)
def __common_pre_proc(blocks, layout_bboxes):
    """Language-independent text pre-processing: clamp lines into their layout
    boxes, then vertically align their edges. Returns the realigned layout
    bboxes."""
    # __add_line_period(blocks, layout_bboxes)
    __align_text_in_layout(blocks, layout_bboxes)
    return __valign_lines(blocks, layout_bboxes)
def __pre_proc_zh_blocks(blocks, layout_bboxes):
    """Paragraph-splitting pre-processing for Chinese text (not implemented yet)."""
    pass
def __pre_proc_en_blocks(blocks, layout_bboxes):
    """Paragraph-splitting pre-processing for English text (not implemented yet)."""
    pass
def __group_line_by_layout(blocks, layout_bboxes, lang='en'):
    """Gather, per layout box, the lines of every block it contains.

    Currently each block holds a single line, so a block is effectively one
    paragraph. Returns one flat list of lines per layout box.
    """
    lines_group = []
    for layout in layout_bboxes:
        collected = []
        for block in blocks:
            if is_in_layout(block['bbox'], layout['layout_bbox']):
                collected.extend(block['lines'])
        lines_group.append(collected)
    return lines_group
def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang='en', char_avg_len=10):
    """Split each layout box's lines into paragraphs.

    *lines_group* has one entry per layout box, holding all lines in it.
    Strategy:
    1. Compute the left/right boundary of each group.
    2. Break on end-of-paragraph features: a line that ends well short of the
       right boundary, or one that fills the row while the next line does not
       start flush left.
    Bullet lists (flush-left head line plus indented continuations) are first
    detected and segmented separately.

    Returns
    -------
    tuple
        ``(layout_paras, list_info)``: per layout, the list of paragraphs
        (each a list of lines) and a ``[starts_with_list, ends_with_list]``
        pair.

    Bug fix: the text branch previously fetched the next line with the
    segment-relative index (``lines[i + 1]``); it now uses the absolute
    index ``lines[start + i + 1]`` so segments not starting at 0 see the
    correct successor.
    """
    list_info = []  # per layout: does it start / end with a list
    layout_paras = []
    right_tail_distance = 1.5 * char_avg_len
    for lines in lines_group:
        paras = []
        total_lines = len(lines)
        if total_lines == 0:
            continue  # nothing to split
        if total_lines == 1:  # a single line cannot be split further
            layout_paras.append([lines])
            list_info.append([False, False])
            continue
        # Detect list structure before splitting so list items are segmented
        # separately from ordinary left-aligned text.
        text_segments, list_start_line = __detect_list_lines(
            lines, new_layout_bbox, lang
        )
        # Split the lines according to the detected text/list ranges.
        layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2]
        layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0]
        para = []  # accumulates lines of the current paragraph
        layout_list_info = [False, False]  # [starts with a list, ends with a list]
        for content_type, start, end in text_segments:
            if content_type == 'list':
                for line in lines[start : end + 1]:
                    if line['bbox'][0] == layout_left:  # flush left: a new list item
                        if len(para) > 0:
                            paras.append(para)
                            para = []
                        para.append(line)
                    else:  # indented: continuation of the current item
                        para.append(line)
                if len(para) > 0:
                    paras.append(para)
                    para = []
                if start == 0:
                    layout_list_info[0] = True
                if end == total_lines - 1:
                    layout_list_info[1] = True
            else:  # ordinary text
                for i, line in enumerate(lines[start : end + 1]):
                    # Use the next line's position (when one exists) to decide
                    # whether to break; otherwise judge this line alone.
                    cur_line_type = line['spans'][-1]['type']
                    abs_idx = start + i  # absolute index into *lines* (bug fix)
                    next_line = lines[abs_idx + 1] if abs_idx < total_lines - 1 else None
                    if cur_line_type in [TEXT, INLINE_EQUATION]:
                        if line['bbox'][2] < layout_right - right_tail_distance:
                            # Line stops short of the right edge: paragraph ends.
                            para.append(line)
                            paras.append(para)
                            para = []
                        elif (
                            line['bbox'][2] >= layout_right - right_tail_distance
                            and next_line
                            and next_line['bbox'][0] == layout_left
                        ):
                            # Line fills the row and the next line is flush
                            # left: the paragraph continues.
                            para.append(line)
                        else:
                            para.append(line)
                            paras.append(para)
                            para = []
                    else:
                        # Images, tables and display equations each form their
                        # own single-line paragraph.
                        if len(para) > 0:  # flush any pending paragraph first
                            paras.append(para)
                            para = []
                        paras.append([line])
                        para = []
        if len(para) > 0:
            paras.append(para)
            para = []
        list_info.append(layout_list_info)
        layout_paras.append(paras)
        paras = []
    return layout_paras, list_info
def __connect_list_inter_layout(
    layout_paras, new_layout_bbox, layout_list_info, page_num, lang
):
    """Join list paragraphs that straddle two adjacent layout boxes.

    When the previous layout ends with a list and the next layout starts with
    uniformly indented single-line paragraphs, those leading lines are moved
    onto the previous layout's last paragraph. ``layout_list_info`` (from
    ``__split_para_in_layoutbox``) drives the decision, since lists and plain
    paragraphs are not otherwise distinguished (TODO).

    Returns the updated ``layout_paras`` plus the page-level
    ``[starts_with_list, ends_with_list]`` pair.
    """
    if (
        len(layout_paras) == 0 or len(layout_list_info) == 0
    ):  # guard: the final return would fail on empty input
        return layout_paras, [False, False]
    for i in range(1, len(layout_paras)):
        pre_layout_list_info = layout_list_info[i - 1]
        next_layout_list_info = layout_list_info[i]
        pre_last_para = layout_paras[i - 1][-1]
        next_paras = layout_paras[i]
        if (
            pre_layout_list_info[1] and not next_layout_list_info[0]
        ):  # previous layout ends with a list, next doesn't start with one:
            # check whether the next layout's leading lines share an indent
            logger.info(f'连接page {page_num} 内的list')
            # Collect the leading single-line paragraphs of layout_paras[i]
            # that are indented relative to their layout's left edge.
            may_list_lines = []
            for j in range(len(next_paras)):
                line = next_paras[j]
                if len(line) == 1:  # only single-line paragraphs qualify; multi-line needs further analysis
                    if (
                        line[0]['bbox'][0]
                        > __find_layout_bbox_by_line(line[0]['bbox'], new_layout_bbox)[
                            0
                        ]
                    ):
                        may_list_lines.append(line[0])
                    else:
                        break
                else:
                    break
            # If every collected line shares the same indentation, append them
            # to the previous layout's last paragraph.
            if (
                len(may_list_lines) > 0
                and len(set([x['bbox'][0] for x in may_list_lines])) == 1
            ):
                pre_last_para.extend(may_list_lines)
                layout_paras[i] = layout_paras[i][len(may_list_lines) :]
    return layout_paras, [
        layout_list_info[0][0],
        layout_list_info[-1][1],
    ]  # also report whether this page starts/ends with a list
def __connect_list_inter_page(
    pre_page_paras,
    next_page_paras,
    pre_page_layout_bbox,
    next_page_layout_bbox,
    pre_page_list_info,
    next_page_list_info,
    page_num,
    lang,
):
    """Connect a list that continues from one page onto the next.

    If the previous page ends with a list and the next page does not start
    with one, look for leading single-line paragraphs on the next page that
    are indented relative to their layout's left edge; if they all share the
    same indentation, attach them to the previous page's last paragraph.

    TODO: lists and ordinary paragraphs are not yet distinguished, so this
    relies on the page-level list info plus the shared-indentation heuristic.

    Mutates both page structures in place. Returns True if a merge happened,
    otherwise False.
    """
    if (
        len(pre_page_paras) == 0 or len(next_page_paras) == 0
    ):  # guard: empty pages would break the indexing below
        return False
    if (
        pre_page_list_info[1] and not next_page_list_info[0]
    ):  # previous page ends with a list, next does not start with one: check indentation
        logger.info(f'连接page {page_num} 内的list')
        # Collect the next page's leading single-line paragraphs whose left
        # edge is indented past their layout's left boundary.
        may_list_lines = []
        for j in range(len(next_page_paras[0])):
            line = next_page_paras[0][j]
            if len(line) == 1:  # only single-line paragraphs qualify; multi-line cases need further analysis
                if (
                    line[0]['bbox'][0]
                    > __find_layout_bbox_by_line(
                        line[0]['bbox'], next_page_layout_bbox
                    )[0]
                ):
                    may_list_lines.append(line[0])
                else:
                    break
            else:
                break
        # If every collected line has the exact same indentation, attach them to
        # the previous page's last paragraph and drop them from the next page.
        if (
            len(may_list_lines) > 0
            and len(set([x['bbox'][0] for x in may_list_lines])) == 1
        ):
            # NOTE(review): this appends the whole run as ONE nested element,
            # while the in-layout variant uses extend() to splice lines in
            # individually — confirm the asymmetry is intentional.
            pre_page_paras[-1].append(may_list_lines)
            next_page_paras[0] = next_page_paras[0][len(may_list_lines) :]
            return True
    return False
def __find_layout_bbox_by_line(line_bbox, layout_bboxes):
    """Return the first layout bbox containing ``line_bbox``, or None if none does."""
    return next(
        (candidate for candidate in layout_bboxes if is_in_layout(line_bbox, candidate)),
        None,
    )
def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
    """Merge paragraphs across adjacent layout boxes on the same page.

    The last paragraph of one layout is joined with the first paragraph of
    the next layout when both conditions hold:
    1. The previous layout's last line reaches its layout's right edge and
       does not end with a sentence-stop character.
    2. The next layout's first line starts flush with its layout's left edge.

    Only text / inline-equation lines are ever merged; images, tables and
    interline equations break the chain.

    Returns a new list of layouts-of-paragraphs. Note that ``layout_paras``
    itself may be mutated (merged first paragraphs are popped from it).
    """
    connected_layout_paras = []
    if len(layout_paras) == 0:
        return connected_layout_paras
    connected_layout_paras.append(layout_paras[0])
    for i in range(1, len(layout_paras)):
        try:
            if (
                len(layout_paras[i]) == 0 or len(layout_paras[i - 1]) == 0
            ):  # TODO empty layouts are skipped; their connection is not considered
                continue
            pre_last_line = layout_paras[i - 1][-1][-1]
            next_first_line = layout_paras[i][0][0]
        except Exception:
            logger.error(f'page layout {i} has no line')
            continue
        pre_last_line_text = ''.join(
            [__get_span_text(span) for span in pre_last_line['spans']]
        )
        pre_last_line_type = pre_last_line['spans'][-1]['type']
        next_first_line_text = ''.join(
            [__get_span_text(span) for span in next_first_line['spans']]
        )
        next_first_line_type = next_first_line['spans'][0]['type']
        # Only plain text and inline equations may be merged across layouts.
        if pre_last_line_type not in [
            TEXT,
            INLINE_EQUATION,
        ] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
            connected_layout_paras.append(layout_paras[i])
            continue
        pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[
            2
        ]
        next_x0_min = __find_layout_bbox_by_line(
            next_first_line['bbox'], new_layout_bbox
        )[0]
        pre_last_line_text = pre_last_line_text.strip()
        next_first_line_text = next_first_line_text.strip()
        if (
            pre_last_line['bbox'][2] == pre_x2_max
            # Guard: a whitespace-only line strips to '' and the original
            # pre_last_line_text[-1] would raise IndexError.
            and pre_last_line_text
            and pre_last_line_text[-1] not in LINE_STOP_FLAG
            and next_first_line['bbox'][0] == next_x0_min
        ):  # previous line fills the row without a stop char; next line starts flush left
            # Merge condition met: join the next layout's first paragraph onto
            # the previous layout's last paragraph.
            connected_layout_paras[-1][-1].extend(layout_paras[i][0])
            layout_paras[i].pop(
                0
            )  # drop the merged paragraph from the next layout
            if len(layout_paras[i]) == 0:
                # NOTE(review): popping from layout_paras while iterating
                # range(1, len(...)) shifts later indices; the stale range end is
                # only rescued by the try/except above. Confirm and restructure
                # if this path is actually hit in practice.
                layout_paras.pop(i)
            else:
                connected_layout_paras.append(layout_paras[i])
        else:
            # Merge condition not met: keep the next layout's paragraphs as-is.
            connected_layout_paras.append(layout_paras[i])
    return connected_layout_paras
def __connect_para_inter_page(
    pre_page_paras,
    next_page_paras,
    pre_page_layout_bbox,
    next_page_layout_bbox,
    page_num,
    lang,
):
    """Join the last paragraph of one page with the first paragraph of the next.

    Merge conditions (both must hold):
    1. The previous page's last line reaches its layout's right edge and does
       not end with a sentence-stop character.
    2. The next page's first line starts flush with its layout's left edge.

    Only text / inline-equation lines are merged. Mutates both page
    structures in place and returns True when a merge happened, else False.
    """
    # Some pages may contain no text at all.
    if (
        len(pre_page_paras) == 0
        or len(next_page_paras) == 0
        or len(pre_page_paras[0]) == 0
        or len(next_page_paras[0]) == 0
    ):  # TODO why does [[]] appear in pre_page_paras?
        return False
    pre_last_para = pre_page_paras[-1][-1]
    next_first_para = next_page_paras[0][0]
    pre_last_line = pre_last_para[-1]
    next_first_line = next_first_para[0]
    pre_last_line_text = ''.join(
        [__get_span_text(span) for span in pre_last_line['spans']]
    )
    pre_last_line_type = pre_last_line['spans'][-1]['type']
    next_first_line_text = ''.join(
        [__get_span_text(span) for span in next_first_line['spans']]
    )
    next_first_line_type = next_first_line['spans'][0]['type']
    if pre_last_line_type not in [
        TEXT,
        INLINE_EQUATION,
    ] or next_first_line_type not in [
        TEXT,
        INLINE_EQUATION,
    ]:  # TODO properly handle paragraphs spanning tables, images and interline equations
        # Not plain text on both sides: do not merge.
        return False
    pre_x2_max = __find_layout_bbox_by_line(
        pre_last_line['bbox'], pre_page_layout_bbox
    )[2]
    next_x0_min = __find_layout_bbox_by_line(
        next_first_line['bbox'], next_page_layout_bbox
    )[0]
    pre_last_line_text = pre_last_line_text.strip()
    next_first_line_text = next_first_line_text.strip()
    if (
        pre_last_line['bbox'][2] == pre_x2_max
        # Guard: a whitespace-only line strips to '' and the original
        # pre_last_line_text[-1] would raise IndexError.
        and pre_last_line_text
        and pre_last_line_text[-1] not in LINE_STOP_FLAG
        and next_first_line['bbox'][0] == next_x0_min
    ):  # previous line fills the row without a stop char; next line starts flush left
        # Merge: append the next page's first paragraph to the previous page's
        # last paragraph, then remove it from the next page.
        pre_last_para.extend(next_first_para)
        next_page_paras[0].pop(0)
        return True
    else:
        return False
def find_consecutive_true_regions(input_array):
    """Locate every maximal run of at least two consecutive truthy values.

    Returns a list of ``(start, end)`` index pairs (inclusive) for each run
    of truthy entries whose length is greater than one; isolated single
    truthy values are ignored.
    """
    regions = []
    total = len(input_array)
    pos = 0
    while pos < total:
        if input_array[pos]:
            run_start = pos
            # Advance to the end of this truthy run.
            while pos < total and input_array[pos]:
                pos += 1
            # Keep only runs longer than a single element.
            if pos - run_start > 1:
                regions.append((run_start, pos - 1))
        else:
            pos += 1
    return regions
def __connect_middle_align_text(
    page_paras, new_layout_bbox, page_num, lang, debug_mode
):
    """
    Find runs of consecutive center-aligned single-line texts and, when the
    lines in a run all have (roughly) the same height, merge the run into a
    single paragraph, in place.
    A line counts as centered when:
    1. Its horizontal span straddles the layout's center point.
    2. There is blank space on both its left and right sides.
    """
    for layout_i, layout_para in enumerate(page_paras):
        layout_box = new_layout_bbox[layout_i]
        single_line_paras_tag = []
        # Tag each paragraph that consists of exactly one text line.
        for i in range(len(layout_para)):
            single_line_paras_tag.append(
                len(layout_para[i]) == 1
                and layout_para[i][0]['spans'][0]['type'] == TEXT
            )
        """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。"""
        # Locate runs (length >= 2) of consecutive single-line paragraphs.
        consecutive_single_line_indices = find_consecutive_true_regions(
            single_line_paras_tag
        )
        if len(consecutive_single_line_indices) > 0:
            # Each merge shrinks layout_para, so subsequent regions' indices
            # must be shifted left by the accumulated offset.
            index_offset = 0
            """检查这些行是否是高度相同的,居中的"""
            for start, end in consecutive_single_line_indices:
                start += index_offset
                end += index_offset
                # Heights of every line in the run.
                line_hi = np.array(
                    [
                        line[0]['bbox'][3] - line[0]['bbox'][1]
                        for line in layout_para[start : end + 1]
                    ]
                )
                first_line_text = ''.join(
                    [__get_span_text(span) for span in layout_para[start][0]['spans']]
                )
                # NOTE(review): leftover debug hook — this branch has no effect.
                if 'Table' in first_line_text or 'Figure' in first_line_text:
                    pass
                if debug_mode:
                    logger.debug(line_hi.std())
                # Treat the lines as equal-height when the std-dev is small.
                if line_hi.std() < 2:
                    """行高度相同,那么判断是否居中."""
                    all_left_x0 = [
                        line[0]['bbox'][0] for line in layout_para[start : end + 1]
                    ]
                    all_right_x1 = [
                        line[0]['bbox'][2] for line in layout_para[start : end + 1]
                    ]
                    layout_center = (layout_box[0] + layout_box[2]) / 2
                    # Centered: every line straddles the layout's center, and the
                    # run is not simply flush against both layout edges.
                    if (
                        all(
                            [
                                x0 < layout_center < x1
                                for x0, x1 in zip(all_left_x0, all_right_x1)
                            ]
                        )
                        and not all([x0 == layout_box[0] for x0 in all_left_x0])
                        and not all([x1 == layout_box[2] for x1 in all_right_x1])
                    ):
                        # Collapse the run into one multi-line paragraph.
                        merge_para = [l[0] for l in layout_para[start : end + 1]]  # noqa: E741
                        para_text = ''.join(
                            [
                                __get_span_text(span)
                                for line in merge_para
                                for span in line['spans']
                            ]
                        )
                        if debug_mode:
                            logger.debug(para_text)
                        layout_para[start : end + 1] = [merge_para]
                        index_offset -= end - start
    return
def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang):
    """Merge consecutive single-line list texts into one paragraph.

    Intended behavior: when the first line is flush left and the following
    single-line paragraphs share the same indentation, merge the run into a
    single paragraph. Not implemented yet — currently a no-op placeholder.
    """
    pass
def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang):
    """Split a single page into paragraphs based on line and layout features.

    Pipeline:
    1. Group the page's lines by layout box.
    2. Split paragraphs inside each layout box — a line that ends short of
       the layout's right edge closes a paragraph; images, tables and
       interline equations each occupy their own paragraph.
    3. Reconnect list paragraphs that span adjacent layouts.
    4. Reconnect ordinary paragraphs that span adjacent layouts.

    Returns the connected layout paragraphs and the page-level
    ``[starts_with_list, ends_with_list]`` info.
    """
    # (Removed a leftover debug breakpoint anchor: `if page_num == 343: pass`.)
    lines_group = __group_line_by_layout(blocks, layout_bboxes, lang)  # group lines per layout box
    layout_paras, layout_list_info = __split_para_in_layoutbox(
        lines_group, new_layout_bbox, lang
    )  # split paragraphs inside each layout
    layout_paras2, page_list_info = __connect_list_inter_layout(
        layout_paras, new_layout_bbox, layout_list_info, page_num, lang
    )  # connect list paragraphs across layouts
    connected_layout_paras = __connect_para_inter_layoutbox(
        layout_paras2, new_layout_bbox, lang
    )  # connect ordinary paragraphs across layouts
    return connected_layout_paras, page_list_info
def para_split(pdf_info_dict, debug_mode, lang='en'):
    """Split every page in ``pdf_info_dict`` into paragraphs, in place.

    Pass 1: per page, compute the layout bboxes and split blocks into
    paragraphs (stored under ``page['para_blocks']``).
    Pass 2: connect paragraphs and lists that continue across page
    boundaries.
    Pass 3: special merges missed above — centered single-line runs and
    (placeholder) indented list-style runs.

    :param pdf_info_dict: mapping of page key -> page dict; each page must
        contain 'preproc_blocks' and 'layout_bboxes'. Mutated in place.
    :param debug_mode: when True, log which pages were connected.
    :param lang: document language hint, forwarded to the helpers.
    """
    new_layout_of_pages = []  # one list of layout bboxes per page
    all_page_list_info = []  # per page: [starts_with_list, ends_with_list]
    for page_num, page in pdf_info_dict.items():
        blocks = page['preproc_blocks']
        layout_bboxes = page['layout_bboxes']
        new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
        new_layout_of_pages.append(new_layout_bbox)
        splited_blocks, page_list_info = __do_split_page(
            blocks, layout_bboxes, new_layout_bbox, page_num, lang
        )
        all_page_list_info.append(page_list_info)
        page['para_blocks'] = splited_blocks
    """连接页面与页面之间的可能合并的段落"""
    # Pass 2: connect paragraphs (and lists) that continue across page boundaries.
    pdf_infos = list(pdf_info_dict.values())
    for page_num, page in enumerate(pdf_info_dict.values()):
        if page_num == 0:
            continue
        pre_page_paras = pdf_infos[page_num - 1]['para_blocks']
        next_page_paras = pdf_infos[page_num]['para_blocks']
        pre_page_layout_bbox = new_layout_of_pages[page_num - 1]
        next_page_layout_bbox = new_layout_of_pages[page_num]
        is_conn = __connect_para_inter_page(
            pre_page_paras,
            next_page_paras,
            pre_page_layout_bbox,
            next_page_layout_bbox,
            page_num,
            lang,
        )
        if debug_mode:
            if is_conn:
                logger.info(f'连接了第{page_num-1}页和第{page_num}页的段落')
        is_list_conn = __connect_list_inter_page(
            pre_page_paras,
            next_page_paras,
            pre_page_layout_bbox,
            next_page_layout_bbox,
            all_page_list_info[page_num - 1],
            all_page_list_info[page_num],
            page_num,
            lang,
        )
        if debug_mode:
            if is_list_conn:
                logger.info(f'连接了第{page_num-1}页和第{page_num}页的列表段落')
    """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
    1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
    2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。
    """
    # Pass 3: special merges missed above — centered single-line runs, and the
    # (not yet implemented) indented list-style runs.
    for page_num, page in enumerate(pdf_info_dict.values()):
        page_paras = page['para_blocks']
        new_layout_bbox = new_layout_of_pages[page_num]
        __connect_middle_align_text(
            page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode
        )
        __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment