Merge pull request #1099 from myhloli/dev

refactor(magic_pdf): remove unused functions and simplify code

Merge pull request #1099 from myhloli/dev
refactor(magic_pdf): remove unused functions and simplify code
e6da37dd · Xiaomeng Zhao · GitHub · 79b58a1e · 6a22b5ab · 79b58a1e
Unverified Commit e6da37dd authored Nov 26, 2024 by Xiaomeng Zhao Committed by GitHub Nov 26, 2024
10 changed files
--- a/magic_pdf/pre_proc/main_text_font.py.bak
+++ b/magic_pdf/pre_proc/main_text_font.py.bak
-import collections
-def get_main_text_font(pdf_docs):
-    """Return the font that renders the most characters across all pages.
-    Fonts are weighted by the number of characters in each span rather than
-    by the number of spans, so many short spans cannot outvote the body font.
-    Args:
-        pdf_docs: Iterable of page objects whose ``get_text('dict')`` returns a
-            mapping with a ``'blocks'`` entry (presumably PyMuPDF pages --
-            confirm against the caller).
-    Returns:
-        The name of the dominant font, or ``None`` when no page contains a
-        non-empty span that carries a ``'font'`` key.
-    """
-    font_names = collections.Counter()
-    for page in pdf_docs:
-        # ``or ()`` makes a None entry behave like an empty list.
-        for block in page.get_text('dict')['blocks'] or ():
-            for line in block.get('lines') or ():
-                for span in line['spans']:
-                    if 'font' in span and len(span['text']) > 0:
-                        font_names[span['font']] += len(span['text'])
-    if not font_names:
-        # Nothing to rank: avoid IndexError on most_common(1)[0].
-        return None
-    return font_names.most_common(1)[0][0]
--- a/magic_pdf/pre_proc/ocr_detect_layout.py.bak
+++ b/magic_pdf/pre_proc/ocr_detect_layout.py.bak
-import fitz
-from magic_pdf.layout.layout_sort import get_bboxes_layout
-from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
-from magic_pdf.libs.coordinate_transform import get_scale_ratio
-def get_center_point(bbox):
-    """Compute the centre point of a bounding box.
-    Args:
-        bbox (list): Box as [x0, y0, x1, y1], i.e. top-left and bottom-right corners.
-    Returns:
-        list: [x, y] coordinates of the centre.
-    """
-    center_x = (bbox[0] + bbox[2]) / 2
-    center_y = (bbox[1] + bbox[3]) / 2
-    return [center_x, center_y]
-def get_area(bbox):
-    """Compute the area of a bounding box.
-    Args:
-        bbox (list): Box as [x0, y0, x1, y1], i.e. top-left and bottom-right corners.
-    Returns:
-        The width multiplied by the height of the box.
-    """
-    width = bbox[2] - bbox[0]
-    height = bbox[3] - bbox[1]
-    return width * height
-def adjust_layouts(layout_bboxes, page_boundry, page_id):
-    """Trim partially overlapping layout boxes, then sort them into a layout tree.
-    For every overlapping pair the box with the larger area is shrunk (in
-    place) so that it stops one unit short of the smaller box. The trimmed
-    boxes are then handed to ``get_bboxes_layout``.
-    Args:
-        layout_bboxes (list): Mutable [x0, y0, x1, y1] boxes; edited in place.
-        page_boundry (list): Page rectangle forwarded to ``get_bboxes_layout``.
-        page_id: Page identifier forwarded to ``get_bboxes_layout``.
-    Returns:
-        tuple: (layout_bboxes, layout_tree) as produced by ``get_bboxes_layout``.
-    """
-    box_count = len(layout_bboxes)
-    for first in range(box_count):
-        for second in range(first + 1, box_count):
-            box_a = layout_bboxes[first]
-            box_b = layout_bboxes[second]
-            if not _is_part_overlap(box_a, box_b):
-                continue
-            # The larger box is the one trimmed; on equal areas the later box is trimmed.
-            if get_area(box_a) > get_area(box_b):
-                larger_layout, smaller_layout = box_a, box_b
-            else:
-                larger_layout, smaller_layout = box_b, box_a
-            large_cx, large_cy = get_center_point(larger_layout)
-            small_cx, small_cy = get_center_point(smaller_layout)
-            distance_x = large_cx - small_cx
-            distance_y = large_cy - small_cy
-            if abs(distance_x) > abs(distance_y):
-                # Boxes sit side by side: move the larger box's left/right edge.
-                if distance_x > 0 and larger_layout[0] < smaller_layout[2]:
-                    larger_layout[0] = smaller_layout[2] + 1
-                if distance_x < 0 and larger_layout[2] > smaller_layout[0]:
-                    larger_layout[2] = smaller_layout[0] - 1
-            else:
-                # Boxes are stacked: move the larger box's top/bottom edge.
-                if distance_y > 0 and larger_layout[1] < smaller_layout[3]:
-                    larger_layout[1] = smaller_layout[3] + 1
-                if distance_y < 0 and larger_layout[3] > smaller_layout[1]:
-                    larger_layout[3] = smaller_layout[1] - 1
-    # get_bboxes_layout receives 13-element rows: the 4 coordinates plus 9 None slots.
-    padded_bboxes = [
-        [box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes
-    ]
-    layout_bboxes, layout_tree = get_bboxes_layout(padded_bboxes, page_boundry, page_id)
-    return layout_bboxes, layout_tree
-def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
-    """Extract sub-layout boxes, drop contained ones and return the sorted layout.
-    Args:
-        layout_info (list): Sub-layout dicts; each has a 'poly' entry of eight
-            numbers from which elements 0, 1 (x0, y0) and 4, 5 (x1, y1) are used.
-        page (fitz.Page): Page whose ``rect`` supplies the page width and height.
-        ocr_page_info (dict): Holds ``['page_info']['page_no']`` (1-based) and is
-            passed to ``get_scale_ratio`` together with ``page``.
-    Returns:
-        tuple: (layout_bboxes, layout_tree) as returned by ``adjust_layouts``.
-    """
-    page_id = ocr_page_info['page_info']['page_no'] - 1
-    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
-    # Rescale every polygon into integer box coordinates.
-    scaled_bboxes = []
-    for sub_layout in layout_info:
-        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
-        scaled_bboxes.append([
-            int(x0 / horizontal_scale_ratio),
-            int(y0 / vertical_scale_ratio),
-            int(x1 / horizontal_scale_ratio),
-            int(y1 / vertical_scale_ratio),
-        ])
-    # Keep only boxes that are not contained in any other box.
-    outer_bboxes = [
-        box
-        for idx, box in enumerate(scaled_bboxes)
-        if not any(
-            _is_in(box, other)
-            for other_idx, other in enumerate(scaled_bboxes)
-            if other_idx != idx
-        )
-    ]
-    page_boundry = [0, 0, page.rect.width, page.rect.height]
-    layout_bboxes, layout_tree = adjust_layouts(outer_bboxes, page_boundry, page_id)
-    return layout_bboxes, layout_tree
--- a/magic_pdf/pre_proc/pdf_pre_filter.py.bak
+++ b/magic_pdf/pre_proc/pdf_pre_filter.py.bak
-from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
-from magic_pdf.libs.commons import fitz
-def __area(box):
-    """Return the area of a box given as (x0, y0, x1, y1)."""
-    width = box[2] - box[0]
-    height = box[3] - box[1]
-    return width * height
-def __is_contain_color_background_rect(
-    page: fitz.Page, text_blocks, image_bboxes
-) -> bool:
-    """Check whether the page has text lying on a large coloured rectangle.
-    The largest filled, non-white drawing that does not touch an image is
-    selected. If it is wider than 20% of the page and taller than 10% of the
-    page, the page matches as soon as one text block lies inside it.
-    Args:
-        page (fitz.Page): Page providing ``rect`` and ``get_cdrawings()``.
-        text_blocks (list): Dicts with a 'bbox' entry.
-        image_bboxes (list): Image boxes; drawings overlapping them are ignored.
-    Returns:
-        bool: True when a text block sits inside the largest coloured rectangle.
-    """
-    p_width, p_height = page.rect.width, page.rect.height
-    color_bg_rect = []
-    for block in page.get_cdrawings():
-        if not ('fill' in block and block['fill']):  # no fill: nothing to consider
-            continue
-        # Components are truncated to int as before, so only a pure 1.0/1.0/1.0
-        # fill counts as white. The comparison is tuple-to-tuple; the previous
-        # list-to-tuple comparison was always False and never skipped white.
-        if tuple(int(component) for component in block['fill'][:3]) == (1, 1, 1):
-            continue
-        rect = block['rect']
-        # Ignore very small rectangles.
-        if __area(rect) < 10 * 10:
-            continue
-        # Ignore colour patches that belong to (e.g. SVG) images.
-        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
-            continue
-        color_bg_rect.append(rect)
-    if not color_bg_rect:
-        return False
-    max_rect = max(color_bg_rect, key=__area)
-    # TODO (carried over): also require more than 3 lines or 50 characters of text.
-    is_wide_enough = max_rect[2] - max_rect[0] > 0.2 * p_width
-    is_tall_enough = max_rect[3] - max_rect[1] > 0.1 * p_height
-    if not (is_wide_enough and is_tall_enough):
-        return False
-    # Containment is tested on integer-truncated coordinates.
-    max_rect_int = tuple(int(value) for value in max_rect[:4])
-    for text_block in text_blocks:
-        box = text_block['bbox']
-        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
-        if _is_in(box_int, max_rect_int):
-            return True
-    return False
-def __is_table_overlap_text_block(text_blocks, table_bbox):
-    """Report whether table_bbox contains or partly overlaps any text block (TODO carried over)."""
-    return any(
-        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
-        for text_block in text_blocks
-    )
-def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
-    """Decide whether a page is acceptable for further processing.
-    Returns:
-        tuple: ``(True, None)`` when the page passes, otherwise ``(False, info)``
-        where ``info`` carries '_need_drop' and '_drop_reason'. The only check
-        performed is the coloured-background test; ``table_bboxes`` is not used.
-    """
-    has_colored_background = __is_contain_color_background_rect(
-        page, text_blocks, image_bboxes
-    )
-    if not has_colored_background:
-        return True, None
-    drop_info = {
-        '_need_drop': True,
-        '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
-    }
-    return False, drop_info
--- a/magic_pdf/pre_proc/post_layout_split.py.bak
+++ b/magic_pdf/pre_proc/post_layout_split.py.bak
--- a/magic_pdf/pre_proc/remove_colored_strip_bbox.py.bak
+++ b/magic_pdf/pre_proc/remove_colored_strip_bbox.py.bak
-from loguru import logger
-from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
-from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
-                                    calculate_overlap_area_2_minbox_area_ratio)
-def __area(box):
-    """Return width times height of a box given as (x0, y0, x1, y1)."""
-    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
-    return (x1 - x0) * (y1 - y0)
-def rectangle_position_determination(rect, p_width):
-    """Tell whether a rectangle lies near the vertical centre line of the page.
-    Args:
-        rect (list): Rectangle as [x1, y1, x2, y2].
-        p_width (int): Page width.
-    Returns:
-        bool: True when the rectangle straddles the centre line or its nearer
-        edge is less than 20% of the page width away from it.
-    """
-    center_x = p_width / 2
-    if rect[0] < center_x and rect[2] > center_x:
-        return True
-    # Only the edge facing the centre line is measured.
-    if rect[0] > center_x:
-        gap = rect[0] - center_x
-    else:
-        gap = center_x - rect[2]
-    return gap < p_width * 0.2
-def remove_colored_strip_textblock(remain_text_blocks, page):
-    """Move text blocks that sit on a coloured header strip out of remain_text_blocks.
-    A strip is a filled, non-white drawing with area above 100 that lies near
-    the page centre line, ends within the top 30% of the page and is more than
-    four times as wide as it is tall. A text block is moved when it is inside
-    a strip, or overlaps it with an overlap-to-smaller-box ratio above 0.6.
-    Args:
-        remain_text_blocks (list): Remaining text blocks; modified in place.
-        page (Page): Page object providing ``rect`` and ``get_cdrawings()``.
-    Returns:
-        tuple: (remain_text_blocks, colored_strip_textblocks); moved blocks are
-        tagged with COLOR_BG_HEADER_TXT_BLOCK.
-    """
-    colored_strip_textblocks = []
-    if len(remain_text_blocks) == 0:
-        return remain_text_blocks, colored_strip_textblocks
-    p_width, p_height = page.rect.width, page.rect.height
-    colored_strip_bg_rect = []
-    for drawing in page.get_cdrawings():
-        has_color_fill = (
-            'fill' in drawing and drawing['fill'] and drawing['fill'] != (1.0, 1.0, 1.0)
-        )
-        rect = drawing['rect']
-        criteria = (
-            has_color_fill,
-            __area(rect) > 100,  # not a tiny rectangle
-            rectangle_position_determination(rect, p_width),
-            rect[3] < p_height * 0.3,  # bottom edge within the top 30% of the page
-            (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4,  # aspect ratio above 4
-        )
-        if all(criteria):
-            colored_strip_bg_rect.append(rect)
-    for colored_strip_block_bbox in colored_strip_bg_rect:
-        for text_block in remain_text_blocks:
-            text_bbox = text_block['bbox']
-            on_strip = _is_in(text_bbox, colored_strip_block_bbox) or (
-                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
-                and calculate_overlap_area_2_minbox_area_ratio(
-                    text_bbox, colored_strip_block_bbox
-                )
-                > 0.6
-            )
-            if not on_strip:
-                continue
-            logger.info(
-                f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
-            )
-            text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
-            colored_strip_textblocks.append(text_block)
-        # Matched blocks are dropped before the next strip is examined.
-        for moved_block in colored_strip_textblocks:
-            if moved_block in remain_text_blocks:
-                remain_text_blocks.remove(moved_block)
-    return remain_text_blocks, colored_strip_textblocks
--- a/magic_pdf/pre_proc/remove_footer_header.py.bak
+++ b/magic_pdf/pre_proc/remove_footer_header.py.bak
-import re
-from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap
-def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
-                                   page_no_bboxs, page_w, page_h):
-    """Remove header, footer and page-number content from one page.
-    Deletion works bottom-up: spans in the header/footer zones are removed,
-    then lines left without spans, and finally blocks left without lines are
-    tagged CONTENT_IN_FOOT_OR_HEADER and moved to the removed list.
-    Args:
-        text_raw_blocks (list): Text block dicts with 'bbox' and 'lines'; each
-            line has 'spans', each span has 'bbox' and 'text'. Mutated in place
-            (and re-sorted bottom-first when no page-number boxes are given).
-        image_bboxes (list): Image boxes.
-        table_bboxes (list): Table boxes.
-        header_bboxs (list): Header boxes as (x0, y0, x1, y1); may be empty.
-        footer_bboxs (list): Footer boxes as (x0, y0, x1, y1); may be empty.
-        page_no_bboxs (list): Page-number boxes; may be empty.
-        page_w: Page width.
-        page_h: Page height.
-    Returns:
-        tuple: (image_bbox_remain, table_bbox_remain, text_block_remain,
-        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove).
-    """
-    # The header zone ends at the lowest header edge, the footer zone starts at
-    # the highest footer edge; without detections the zones are empty.
-    header_y0 = max(bbox[3] for bbox in header_bboxs) if header_bboxs else 0
-    footer_y0 = min(bbox[1] for bbox in footer_bboxs) if footer_bboxs else page_h
-    if page_no_bboxs:
-        # Page numbers in the top/bottom half extend the header/footer zone.
-        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
-        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
-        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
-        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h
-        header_y0 = max(header_y0, top_max_y0)
-        footer_y0 = min(footer_y0, btn_min_y1)
-    content_boundry = [0, header_y0, page_w, footer_y0]
-    header = [0, 0, page_w, header_y0]
-    footer = [0, footer_y0, page_w, page_h]
-    def _in_header_or_footer(span):
-        bbox = span['bbox']
-        return (
-            bbox[3] < header_y0
-            or _is_in_or_part_overlap(bbox, header)
-            or _is_in_or_part_overlap(bbox, footer)
-        )
-    text_block_to_remove = []
-    for blk in text_raw_blocks:
-        for line in blk['lines']:
-            # Slice assignment keeps the caller's list objects and never removes
-            # items from a list that is being iterated.
-            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span)]
-        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
-        if not blk['lines']:
-            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
-            text_block_to_remove.append(blk)
-    # A page number is often so small that it slightly overlaps the content
-    # area and survives the pass above, so it is handled at span granularity.
-    page_no_block_2_remove = []
-    def _queue_page_no_block(block):
-        # Identity check: a block must be queued once even if several spans match.
-        if not any(block is queued for queued in page_no_block_2_remove):
-            page_no_block_2_remove.append(block)
-    if page_no_bboxs:
-        for pagenobox in page_no_bboxs:
-            for block in text_raw_blocks:
-                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
-                    continue
-                for line in block['lines']:
-                    for span in line['spans']:
-                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
-                            span['tag'] = PAGE_NO
-                            # A block consisting of just this span is removed entirely.
-                            if len(line['spans']) == 1 and len(block['lines']) == 1:
-                                _queue_page_no_block(block)
-    elif len(text_raw_blocks) > 0:
-        # Heuristic: the bottom-most block is a page number when it has exactly
-        # one line with one span whose text contains a digit and no letters.
-        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
-        last_block = text_raw_blocks[0]
-        if len(last_block['lines']) == 1:
-            last_line = last_block['lines'][0]
-            if len(last_line['spans']) == 1:
-                last_span = last_line['spans'][0]
-                text = last_span['text']
-                if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
-                    last_span['tag'] = PAGE_NO
-                    _queue_page_no_block(last_block)
-    text_block_to_remove.extend(page_no_block_2_remove)
-    for blk in text_block_to_remove:
-        if blk in text_raw_blocks:
-            text_raw_blocks.remove(blk)
-    text_block_remain = text_raw_blocks
-    # Images and tables are kept only when they touch the content area.
-    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
-    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
-    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
-    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
-    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
--- a/magic_pdf/pre_proc/remove_rotate_bbox.py.bak
+++ b/magic_pdf/pre_proc/remove_rotate_bbox.py.bak
-import math
-import re
-from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
-                                       VERTICAL_TEXT)
-from magic_pdf.libs.boxbase import is_vbox_on_side
-def detect_non_horizontal_texts(result_dict):
-    """Flag watermarks and vertical margin notes in the document.
-    Blocks are grouped by (bbox, text). A group counts as a watermark when its
-    direction angle is between 5 and 85 degrees, and as a vertical margin note
-    when the angle is between 85 and 105 degrees (bounds exclusive). A group
-    is confirmed when it occurs on more than half of the pages; headers and
-    footers change from page to page and therefore do not qualify.
-    Parameters
-    ----------
-    result_dict : dict
-        Maps 'page_*' keys to page dicts, which map 'block_*' keys to block
-        dicts holding 'bbox', 'text' and optionally 'dir'. Other keys are
-        ignored.
-    Returns
-    -------
-    result_dict : dict
-        The same dictionary; every block gains 'is_watermark' and
-        'is_vertical_margin_note' set to 1 or 0.
-    """
-    def _block_key(block_data):
-        # bbox may be a list, which is unhashable; a tuple works as a dict key.
-        return tuple(block_data['bbox']), block_data['text']
-    potential_watermarks = {}
-    potential_margin_notes = {}
-    page_count = 0
-    for page_id, page_content in result_dict.items():
-        if not page_id.startswith('page_'):
-            continue
-        page_count += 1
-        for block_id, block_data in page_content.items():
-            if not block_id.startswith('block_') or 'dir' not in block_data:
-                continue
-            coordinates_text = _block_key(block_data)
-            angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
-            angle = abs(math.degrees(angle))
-            if angle > 5 and angle < 85:  # slanted text: watermark candidate
-                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
-            if angle > 85 and angle < 105:  # vertical text: margin-note candidate
-                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1
-    # "More than half of the pages": count pages only, not unrelated keys.
-    threshold = page_count // 2
-    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
-    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}
-    for page_id, blocks in result_dict.items():
-        if not page_id.startswith('page_'):
-            continue
-        for block_id, block_data in blocks.items():
-            if not block_id.startswith('block_'):
-                continue
-            coordinates_text = _block_key(block_data)
-            block_data['is_watermark'] = 1 if coordinates_text in watermarks else 0
-            block_data['is_vertical_margin_note'] = 1 if coordinates_text in margin_notes else 0
-    return result_dict
-"""
-1. 当一个block里全部文字都不是dir=(1,0)，这个block整体去掉
-2. 当一个block里全部文字都是dir=(1,0)，但是每行只有一个字，这个block整体去掉。这个block必须出现在页面的四周，否则不去掉
-"""
-def __is_a_word(sentence):
-    """Return True for one CJK character (U+4E00-U+9FA5) or one or two ASCII letters/digits."""
-    is_single_cjk = re.fullmatch(r'[\u4e00-\u9fa5]', sentence) is not None
-    if is_single_cjk:
-        return True
-    is_alnum = re.fullmatch(r'[a-zA-Z0-9]+', sentence) is not None
-    return is_alnum and len(sentence) <= 2
-def __get_text_color(num):
-    """Split a packed integer colour into its (red, green, blue) byte values."""
-    red = (num >> 16) & 0xFF
-    green = (num >> 8) & 0xFF
-    blue = num & 0xFF
-    return red, green, blue
-def __is_empty_side_box(text_block):
-    """Return True when no span of the block has non-blank text in a non-white colour."""
-    for line in text_block['lines']:
-        for span in line['spans']:
-            rgb = __get_text_color(span['color'])
-            has_text = len(span['text'].strip()) > 0
-            if has_text and rgb != (255, 255, 255):
-                return False
-    return True
-def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
-    """Remove vertical and rotated text blocks that lie at the sides of the page.
-    Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height, 0.2)``
-    holds are examined. A block is tagged VERTICAL_TEXT when it has several
-    lines, each with exactly one single-word span, all starting at the same
-    integer x0; otherwise it is tagged ROTATE_TEXT when any line has a
-    direction other than (1, 0).
-    Args:
-        pymu_text_block (list): Text blocks (format as in
-            test/assets/papre/pymu_textblocks.json); modified in place.
-        page_width: Page width.
-        page_height: Page height.
-    Returns:
-        tuple: (pymu_text_block, removed_text_block); removed blocks carry a 'tag'.
-    """
-    removed_text_block = []
-    for block in pymu_text_block:
-        lines = block['lines']
-        block_bbox = block['bbox']
-        # Only blocks at the page sides are candidates.
-        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):
-            continue
-        one_word_per_line = (
-            all([__is_a_word(line['spans'][0]['text']) for line in lines if len(line['spans']) > 0])
-            and len(lines) > 1
-            and all([len(line['spans']) == 1 for line in lines])
-        )
-        if one_word_per_line:
-            first_x0s = [int(line['spans'][0]['bbox'][0]) for line in lines if len(line['spans']) > 0]
-            # Vertically stacked: every word starts at the same x0, at least two words.
-            if len(set(first_x0s)) == 1 and len(first_x0s) > 1:
-                block['tag'] = VERTICAL_TEXT
-                removed_text_block.append(block)
-                continue
-        # A single non-horizontal line is enough to drop the whole block.
-        if any(line['dir'] != (1, 0) for line in lines):
-            block['tag'] = ROTATE_TEXT
-            removed_text_block.append(block)
-    for block in removed_text_block:
-        pymu_text_block.remove(block)
-    return pymu_text_block, removed_text_block
-def get_side_boundry(rotate_bbox, page_width, page_height):
-    """Derive the left and right body-text boundaries from the removed side blocks.
-    Args:
-        rotate_bbox (list): Dicts with a 'bbox' entry.
-        page_width: Page width.
-        page_height: Page height (not used).
-    Returns:
-        tuple: (left, right) where left is one past the right-most edge of the
-        boxes ending in the left half, and right is one before the left-most
-        edge of the remaining boxes.
-    """
-    left_x, right_x = 0, page_width
-    half_width = page_width / 2
-    for item in rotate_bbox:
-        box = item['bbox']
-        if box[2] < half_width:
-            left_x = max(left_x, box[2])
-        else:
-            right_x = min(right_x, box[0])
-    return left_x + 1, right_x - 1
-def remove_side_blank_block(pymu_text_block, page_width, page_height):
-    """Remove blank blocks located at the sides of the page.
-    Args:
-        pymu_text_block (list): Text blocks (format as in
-            test/assets/papre/pymu_textblocks.json); modified in place.
-        page_width: Page width.
-        page_height: Page height.
-    Returns:
-        tuple: (pymu_text_block, removed_text_block); removed blocks are tagged
-        EMPTY_SIDE_BLOCK.
-    """
-    removed_text_block = []
-    for block in pymu_text_block:
-        # Only blocks at the page sides that hold no visible text qualify.
-        on_side = is_vbox_on_side(block['bbox'], page_width, page_height, 0.2)
-        if on_side and __is_empty_side_box(block):
-            block['tag'] = EMPTY_SIDE_BLOCK
-            removed_text_block.append(block)
-    for block in removed_text_block:
-        pymu_text_block.remove(block)
-    return pymu_text_block, removed_text_block
--- a/magic_pdf/pre_proc/resolve_bbox_conflict.py.bak
+++ b/magic_pdf/pre_proc/resolve_bbox_conflict.py.bak
-"""
-从pdf里提取出来api给出的bbox,然后根据重叠情况做出取舍
-1. 首先去掉出现在图片上的bbox，图片包括表格和图片
-2. 然后去掉出现在文字block上的图片bbox
-"""
-from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
-from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
-                                    _is_left_overlap)
-def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
-                                  text_raw_blocks: list):
-    """Resolve overlaps between images, tables, equations and text blocks.
-    text_raw_blocks has the structure taken directly from pymupdf (sample:
-    test/assets/papre/pymu_textblocks.json). The current, deliberately crude
-    strategy is:
-    1. Text blocks fully inside an image or a table are tagged and removed.
-    2. Equations that touch an image or a table are removed.
-    3. Images that touch a remaining text block are dropped.
-    4. Images that touch each other are all dropped (no bbox adjustment yet).
-    All input lists are modified in place.
-    Returns:
-        tuple: (images, tables, interline_equations, inline_equations,
-        text_raw_blocks, text_block_removed, images_backup,
-        text_block_removed_2). ``images_backup`` holds the dropped images;
-        ``text_block_removed_2`` is always empty because merging text blocks
-        into interline equations is currently disabled.
-    """
-    text_block_removed = []
-    images_backup = []
-    def _mark_removed(text_block, tag):
-        # The latest tag wins, but a block is listed only once even when it
-        # lies inside several images/tables.
-        text_block['tag'] = tag
-        if not any(text_block is seen for seen in text_block_removed):
-            text_block_removed.append(text_block)
-    def _drop_equations_touching(boxes):
-        # In-place filtering keeps the caller's list objects.
-        for equations in (interline_equations, inline_equations):
-            equations[:] = [
-                eq for eq in equations
-                if not any(_is_in_or_part_overlap(box, eq[:4]) for box in boxes)
-            ]
-    # Text blocks lying on images, then on tables.
-    for image_box in images:
-        for text_block in text_raw_blocks:
-            if _is_in(text_block['bbox'], image_box):
-                _mark_removed(text_block, ON_IMAGE_TEXT)
-    for table_box in tables:
-        for text_block in text_raw_blocks:
-            if _is_in(text_block['bbox'], table_box):
-                _mark_removed(text_block, ON_TABLE_TEXT)
-    for text_block in text_block_removed:
-        if text_block in text_raw_blocks:
-            text_raw_blocks.remove(text_block)
-    # Equations appearing on images, then on tables.
-    _drop_equations_touching(images)
-    _drop_equations_touching(tables)
-    # An image that touches text is dropped.
-    for image_box in images:
-        if any(_is_in_or_part_overlap(image_box, text_block['bbox']) for text_block in text_raw_blocks):
-            images_backup.append(image_box)
-    for image_box in images_backup:
-        images.remove(image_box)
-    # Images touching each other are all kept out of the layout computation.
-    dup_idx = set()
-    for i in range(len(images)):
-        for j in range(i + 1, len(images)):
-            if _is_in_or_part_overlap(images[i], images[j]):
-                dup_idx.update((i, j))
-    for img_id in sorted(dup_idx):  # sorted: deterministic backup order
-        images_backup.append(images[img_id])
-    # Slice assignment avoids leaving None placeholders in the caller's list.
-    images[:] = [img for img_id, img in enumerate(images) if img_id not in dup_idx]
-    # Disabled step: text blocks overlapping interline equations were meant to
-    # be set aside here and merged back after the layout computation.
-    text_block_removed_2 = []
-    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
-def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
-    """Report whether any two text blocks overlap horizontally.
-    When this happens the pdf is not processed further, because it most likely
-    means a formula went undetected. Only blocks whose top is at or below the
-    largest header y0 and whose bottom is at or above the smallest footer y1
-    are compared.
-    Args:
-        text_blocks (list): Dicts with a 'bbox' entry.
-        header: Header boxes; when empty the upper limit is 0.
-        footer: Footer boxes; when empty the lower limit is the largest block bottom.
-    Returns:
-        bool: True when ``_is_left_overlap`` holds for some pair in either order.
-    """
-    if len(text_blocks) == 0:
-        return False
-    page_max_y = max(item['bbox'][3] for item in text_blocks)
-    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else 0
-    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else page_max_y
-    txt_bboxes = [
-        block['bbox']
-        for block in text_blocks
-        if block['bbox'][1] >= clip_y0 and block['bbox'][3] <= clip_y1
-    ]
-    for idx, first in enumerate(txt_bboxes):
-        for second in txt_bboxes[idx + 1:]:
-            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
-                return True
-    return False
-def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
-    """Find the first pair of blocks that overlap horizontally.
-    Such an overlap most likely means a formula went undetected, so the pdf is
-    not processed further when one is found.
-    Args:
-        useful_blocks (list): Dicts with a 'bbox' entry.
-    Returns:
-        tuple: ``(True, smaller_bbox, larger_bbox)`` for the first overlapping
-        pair (on equal areas the earlier block comes first), otherwise
-        ``(False, None, None)`` -- also for empty input, so callers can always
-        unpack three values.
-    """
-    if len(useful_blocks) == 0:
-        return False, None, None
-    page_min_y = 0
-    page_max_y = max(block['bbox'][3] for block in useful_blocks)
-    useful_bboxes = [
-        block['bbox']
-        for block in useful_blocks
-        if block['bbox'][1] >= page_min_y and block['bbox'][3] <= page_max_y
-    ]
-    for i in range(len(useful_bboxes)):
-        for j in range(i + 1, len(useful_bboxes)):
-            bbox_i, bbox_j = useful_bboxes[i], useful_bboxes[j]
-            if not (_is_left_overlap(bbox_i, bbox_j) or _is_left_overlap(bbox_j, bbox_i)):
-                continue
-            area_i = (bbox_i[2] - bbox_i[0]) * (bbox_i[3] - bbox_i[1])
-            area_j = (bbox_j[2] - bbox_j[0]) * (bbox_j[3] - bbox_j[1])
-            if area_i > area_j:
-                return True, bbox_j, bbox_i
-            return True, bbox_i, bbox_j
-    return False, None, None
--- a/magic_pdf/pre_proc/solve_line_alien.py.bak
+++ b/magic_pdf/pre_proc/solve_line_alien.py.bak
-def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
-    """Fix overly wide gaps between pieces of text that share one visual row.
-    When a line has the same integer top and bottom as the preceding line of
-    its block, a space is prepended to the text of its first span.
-    Args:
-        pdf_info_dict (dict): Maps 'page_*' keys to page dicts that hold
-            'preproc_blocks'; other keys are ignored. Modified in place.
-    Returns:
-        dict: The same ``pdf_info_dict``.
-    """
-    for page_id, page_info in pdf_info_dict.items():
-        if not page_id.startswith('page_'):
-            # range(len(dict)) used to raise KeyError when non-page keys were present.
-            continue
-        for block in page_info['preproc_blocks']:
-            prev_y1, prev_y2 = 0, 0
-            for line in block['lines']:
-                _, cur_y1, _, cur_y2 = line['bbox']
-                same_row = int(cur_y1) == int(prev_y1) and int(cur_y2) == int(prev_y2)
-                if same_row and line['spans']:  # a line without spans has no text to pad
-                    line['spans'][0]['text'] = ' ' + line['spans'][0]['text']
-                prev_y1, prev_y2 = cur_y1, cur_y2
-    return pdf_info_dict
--- a/magic_pdf/pre_proc/statistics.py.bak
+++ b/magic_pdf/pre_proc/statistics.py.bak
-"""
-统计出需要跨页、全局性的数据
- 统计出字号从大到小
- 正文区域占比最高的前5
- 正文平均行间距
- 正文平均字间距
- 正文平均字符宽度
- 正文平均字符高度
-"""