Unverified Commit e6da37dd authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1099 from myhloli/dev

refactor(magic_pdf): remove unused functions and simplify code
parents 79b58a1e 6a22b5ab
import collections
def get_main_text_font(pdf_docs):
    """Return the font that carries the most characters in a document.

    Each page is asked for ``page.get_text('dict')`` and the text length of
    every span is credited to that span's font, so the winner is decided by
    character volume rather than by the number of spans.

    Args:
        pdf_docs: Iterable of page objects exposing ``get_text('dict')``,
            which returns a mapping with a ``'blocks'`` entry.

    Returns:
        The name of the dominant font, or ``None`` when no span with a font
        and non-empty text exists (previously this case raised IndexError).
    """
    font_names = collections.Counter()
    for page in pdf_docs:
        # ``or ()`` tolerates a missing / None block list or line list
        # (blocks without a 'lines' entry are simply skipped).
        for block in page.get_text('dict')['blocks'] or ():
            for line in block.get('lines') or ():
                for span in line['spans']:
                    if 'font' in span and len(span['text']) > 0:
                        font_names[span['font']] += len(span['text'])
    if not font_names:
        # Nothing to rank: empty document or a document without text spans.
        return None
    return font_names.most_common(1)[0][0]
import fitz
from magic_pdf.layout.layout_sort import get_bboxes_layout
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
from magic_pdf.libs.coordinate_transform import get_scale_ratio
def get_center_point(bbox):
    """Compute the centre point of a bounding box.

    Args:
        bbox (list): Four coordinates ``[x0, y0, x1, y1]`` - the top-left
            corner followed by the bottom-right corner.

    Returns:
        list: ``[x, y]`` coordinates of the centre.
    """
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    return [center_x, center_y]
def get_area(bbox):
    """Compute the area of a bounding box.

    Args:
        bbox (list): Four coordinates ``[x0, y0, x1, y1]`` - the top-left
            corner followed by the bottom-right corner.

    Returns:
        float: Width multiplied by height of the box.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
def adjust_layouts(layout_bboxes, page_boundry, page_id):
    """Pull partially overlapping layout boxes apart, then sort them.

    For every pair of boxes that ``_is_part_overlap`` reports as overlapping,
    the box with the larger area is shrunk in place so that it ends one unit
    before (or starts one unit after) the smaller box.  The direction of the
    correction follows whichever centre-to-centre distance is larger.

    Args:
        layout_bboxes (list): Mutable ``[x0, y0, x1, y1]`` boxes; they are
            modified in place.
        page_boundry (list): Page boundary, passed on to ``get_bboxes_layout``.
        page_id: Page identifier, passed on to ``get_bboxes_layout``.

    Returns:
        tuple: ``(layout_bboxes, layout_tree)`` as produced by
        ``get_bboxes_layout``.
    """
    for idx, first in enumerate(layout_bboxes):
        for second in layout_bboxes[idx + 1:]:
            if not _is_part_overlap(first, second):
                continue

            # The strictly larger box gives way; on equal areas the later
            # box of the pair is treated as the larger one.
            first_area = get_area(first)
            second_area = get_area(second)
            if first_area > second_area:
                bigger, smaller = first, second
            else:
                bigger, smaller = second, first

            bigger_center = get_center_point(bigger)
            smaller_center = get_center_point(smaller)
            dx = bigger_center[0] - smaller_center[0]
            dy = bigger_center[1] - smaller_center[1]

            if abs(dx) > abs(dy):
                # Boxes sit side by side: trim the left or right edge.
                if dx > 0 and bigger[0] < smaller[2]:
                    bigger[0] = smaller[2] + 1
                if dx < 0 and bigger[2] > smaller[0]:
                    bigger[2] = smaller[0] - 1
            else:
                # Boxes are stacked: trim the top or bottom edge.
                if dy > 0 and bigger[1] < smaller[3]:
                    bigger[1] = smaller[3] + 1
                if dy < 0 and bigger[3] > smaller[1]:
                    bigger[3] = smaller[1] - 1

    # get_bboxes_layout receives 13-slot records: the four coordinates
    # followed by nine empty slots.
    padded_bboxes = [
        [box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes
    ]
    layout_bboxes, layout_tree = get_bboxes_layout(padded_bboxes, page_boundry, page_id)
    return layout_bboxes, layout_tree
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
    """Turn model layout polygons into sorted, non-nested layout boxes.

    Each sub-layout polygon is reduced to its ``[x0, y0, x1, y1]`` box,
    divided by the scale ratios from ``get_scale_ratio``, and truncated to
    integers.  Boxes that ``_is_in`` reports as lying inside another box are
    discarded; the rest are handed to ``adjust_layouts``.

    Args:
        layout_info (list): Sub-layout dicts, each with a ``'poly'`` entry of
            eight numbers; items 0, 1 are used as the top-left corner and
            items 4, 5 as the bottom-right corner.
        page (fitz.Page): Page whose ``rect`` supplies the page boundary.
        ocr_page_info (dict): Must contain ``['page_info']['page_no']``
            (1-based); also passed to ``get_scale_ratio``.

    Returns:
        tuple: ``(layout_bboxes, layout_tree)`` as returned by
        ``adjust_layouts``.
    """
    page_id = ocr_page_info['page_info']['page_no'] - 1
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)

    # Scale every polygon down to an integer bbox in page coordinates.
    scaled_bboxes = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        scaled_bboxes.append([
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ])

    # Keep only the boxes that are not contained in any other box.
    outer_bboxes = [
        candidate
        for i, candidate in enumerate(scaled_bboxes)
        if not any(
            _is_in(candidate, other)
            for j, other in enumerate(scaled_bboxes)
            if i != j
        )
    ]

    page_boundry = [0, 0, page.rect.width, page.rect.height]
    layout_bboxes, layout_tree = adjust_layouts(outer_bboxes, page_boundry, page_id)
    return layout_bboxes, layout_tree
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
from magic_pdf.libs.commons import fitz
def __area(box):
    """Return the area (width times height) of an ``(x0, y0, x1, y1)`` box."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
def __is_contain_color_background_rect(
    page: 'fitz.Page', text_blocks, image_bboxes
) -> bool:
    """Tell whether text sits on a large coloured background rectangle.

    The largest filled, non-white drawing on the page that does not touch an
    image is taken as the candidate background.  If it is wider than 20% of
    the page and taller than 10% of the page, and at least one text block
    lies inside it (according to ``_is_in``), the page is reported.

    Args:
        page (fitz.Page): Page providing ``rect`` and ``get_cdrawings()``.
        text_blocks (list): Dicts with a ``'bbox'`` entry.
        image_bboxes (list): Image boxes; drawings touching them are ignored
            because they may be colour patches of an SVG image.

    Returns:
        bool: True when a text block lies inside the largest coloured
        rectangle, otherwise False.
    """
    p_width, p_height = page.rect.width, page.rect.height

    color_bg_rect = []
    for block in page.get_cdrawings():
        fill = block.get('fill')
        if not fill:
            # No fill at all: nothing is painted behind the text.
            continue
        # Channels are truncated to 0/1 exactly as before, so only a pure
        # white fill is treated as white.  The comparison is done on a tuple:
        # the old code compared a *list* with a tuple, which is never equal
        # in Python, so white rectangles were never skipped.
        if tuple(int(channel) for channel in fill[:3]) == (1, 1, 1):
            continue
        rect = block['rect']
        # Ignore tiny rectangles.
        if __area(rect) < 10 * 10:
            continue
        # Ignore colour patches that belong to (SVG) images.
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if not color_bg_rect:
        return False

    max_rect = max(color_bg_rect, key=__area)
    is_large_enough = (
        max_rect[2] - max_rect[0] > 0.2 * p_width
        and max_rect[3] - max_rect[1] > 0.1 * p_height
    )
    if not is_large_enough:
        return False

    # TODO (carried over): require more than 3 lines or 50 characters of text.
    max_rect_int = (
        int(max_rect[0]),
        int(max_rect[1]),
        int(max_rect[2]),
        int(max_rect[3]),
    )
    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True
    return False
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Tell whether ``table_bbox`` contains or partly overlaps any text block.

    Args:
        text_blocks (list): Dicts with a ``'bbox'`` entry.
        table_bbox: Bounding box of the table.

    Returns:
        bool: True as soon as one text block overlaps the table. TODO.
    """
    return any(
        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
        for text_block in text_blocks
    )
def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page is acceptable for further processing.

    Args:
        page (fitz.Page): The page to inspect.
        text_blocks (list): Text blocks of the page.
        table_bboxes (list): Table boxes (not used by the current check).
        image_bboxes (list): Image boxes of the page.

    Returns:
        tuple: ``(True, None)`` when the page is acceptable, otherwise
        ``(False, info)`` where ``info`` carries ``'_need_drop'`` and
        ``'_drop_reason'``.
    """
    has_text_on_color_bg = __is_contain_color_background_rect(
        page, text_blocks, image_bboxes
    )
    if not has_text_on_color_bg:
        return True, None

    drop_info = {
        '_need_drop': True,
        '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
    }
    return False, drop_info
from loguru import logger
from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
calculate_overlap_area_2_minbox_area_ratio)
def __area(box):
    """Return the area of an ``(x0, y0, x1, y1)`` box: width times height."""
    horizontal_extent = box[2] - box[0]
    vertical_extent = box[3] - box[1]
    return horizontal_extent * vertical_extent
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies near the vertical centre line of a page.

    Args:
        rect (list): Rectangle as ``[x1, y1, x2, y2]``.
        p_width (int): Page width.

    Returns:
        bool: True when the rectangle straddles the centre line, or when the
        gap between the centre line and the rectangle's nearer edge is below
        20% of the page width; False otherwise.
    """
    x_axis = p_width / 2
    left, right = rect[0], rect[2]

    # A rectangle crossing the centre line is trivially "near" it.
    if left < x_axis < right:
        return True

    # Otherwise measure the gap on the side that faces the centre line.
    gap = left - x_axis if left > x_axis else x_axis - right
    return gap < p_width * 0.2
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Move text blocks that sit on a coloured header strip out of the list.

    A drawing counts as a coloured strip when it is filled with a non-white
    colour, has an area above 100, lies near the page's vertical centre line,
    ends within the top 30% of the page and is more than four times as wide
    as it is tall.  A text block is taken out when it is inside such a strip,
    or overlaps it with an overlap-to-smaller-box ratio above 0.6.

    Args:
        remain_text_blocks (list): Text blocks still under consideration;
            modified in place.
        page (Page): Page providing ``rect`` and ``get_cdrawings()``.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)`` - the
        remaining blocks and the blocks that were removed (each tagged with
        ``COLOR_BG_HEADER_TXT_BLOCK``).
    """
    colored_strip_textblocks = []
    if len(remain_text_blocks) == 0:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    # Collect every drawing that qualifies as a coloured strip.
    colored_strip_bg_rect = []
    for drawing in page.get_cdrawings():
        fill = drawing.get('fill')
        rect = drawing['rect']
        if not fill or fill == (1.0, 1.0, 1.0):
            continue  # unfilled or white
        strip_width = rect[2] - rect[0]
        strip_height = rect[3] - rect[1]
        if (
            __area(rect) > 100
            and rectangle_position_determination(rect, p_width)
            and rect[3] < p_height * 0.3
            and strip_width > strip_height * 4
        ):
            colored_strip_bg_rect.append(rect)

    for colored_strip_block_bbox in colored_strip_bg_rect:
        for text_block in remain_text_blocks:
            text_bbox = text_block['bbox']
            sits_on_strip = _is_in(text_bbox, colored_strip_block_bbox) or (
                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(
                    text_bbox, colored_strip_block_bbox
                )
                > 0.6
            )
            if sits_on_strip:
                logger.info(
                    f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
                )
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)

        # Removal happens after the scan so the list is not mutated while it
        # is being iterated.
        for tagged_block in colored_strip_textblocks:
            if tagged_block in remain_text_blocks:
                remain_text_blocks.remove(tagged_block)

    return remain_text_blocks, colored_strip_textblocks
import re
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Strip header, footer and page-number content from a single page.

    Spans that lie in the header or footer band are deleted line by line.  A
    line left without spans is dropped, and a block left without lines is
    tagged ``CONTENT_IN_FOOT_OR_HEADER`` and moved to the removed list.  Page
    numbers are then tagged ``PAGE_NO`` at span level; a block consisting of
    nothing but the page number is removed as well.

    ``text_raw_blocks`` and the blocks/lines inside it are modified in place
    (when no page-number boxes are given the list is also re-sorted by
    descending top edge).

    Args:
        text_raw_blocks (list): Text block dicts with ``'bbox'`` and
            ``'lines'``; every line has ``'spans'``; every span has
            ``'bbox'`` and ``'text'``.
        image_bboxes (list): Image bounding boxes.
        table_bboxes (list): Table bounding boxes.
        header_bboxs (list): Header boxes as ``(x0, y0, x1, y1)``.
        footer_bboxs (list): Footer boxes as ``(x0, y0, x1, y1)``.
        page_no_bboxs (list): Page-number boxes.
        page_w: Page width.
        page_h: Page height.

    Returns:
        tuple: ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
    """
    # Lower edge of the merged header boxes / upper edge of the merged
    # footer boxes; without detections the bands are empty.
    header_y0 = max(y1 for _, _, _, y1 in header_bboxs) if header_bboxs else 0
    footer_y0 = min(y0 for _, y0, _, _ in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        # Page numbers in the top/bottom half extend the header/footer band.
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]
        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h
        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def in_header_or_footer(span):
        bbox = span['bbox']
        return (
            bbox[3] < header_y0
            or _is_in_or_part_overlap(bbox, header)
            or _is_in_or_part_overlap(bbox, footer)
        )

    # Delete at span level, then drop emptied lines, then emptied blocks.
    # (The previous code re-created its "to delete" lists inside the loops
    # that filled them, so only the last span of the last line was ever
    # examined and emptied blocks were never reported.)
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            line['spans'][:] = [
                span for span in line['spans'] if not in_header_or_footer(span)
            ]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # A small page number often overlaps the content area slightly and would
    # otherwise end up in the body text, so it is handled at span level.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # A block made of this single span goes entirely.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif len(text_raw_blocks) > 0:
        # Heuristic: the block with the largest top edge is a page number if
        # it has exactly one line with one span whose text contains a digit
        # and no ASCII letters.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                    last_span['tag'] = PAGE_NO
                    page_no_block_2_remove.append(last_block)

    text_block_to_remove.extend(page_no_block_2_remove)
    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)
    text_block_remain = text_raw_blocks

    # Images and tables are kept only when they touch the content area.
    image_bbox_remain, image_bbox_to_remove = [], []
    for bbox in image_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            image_bbox_remain.append(bbox)
        else:
            image_bbox_to_remove.append(bbox)

    table_bbox_remain, table_bbox_to_remove = [], []
    for bbox in table_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            table_bbox_remain.append(bbox)
        else:
            table_bbox_to_remove.append(bbox)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
import math
import re
from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
VERTICAL_TEXT)
from magic_pdf.libs.boxbase import is_vbox_on_side
def detect_non_horizontal_texts(result_dict):
    """Flag repeated slanted and vertical text blocks in a document.

    Blocks are identified by the pair (bbox, text).  A block whose writing
    direction is slanted (strictly between 5 and 85 degrees) is a watermark
    candidate; one whose direction is roughly vertical (strictly between 85
    and 105 degrees) is a margin-note candidate.  A candidate is confirmed
    when it occurs more than ``len(result_dict) // 2`` times, after which
    every block with the same (bbox, text) pair is flagged.

    Parameters
    ----------
    result_dict : dict
        Mapping whose ``'page_*'`` entries are dicts; their ``'block_*'``
        entries are block dicts with ``'bbox'``, ``'text'`` and optionally
        ``'dir'`` (a direction vector).  Other entries are ignored.

    Returns
    -------
    result_dict : dict
        The same dictionary; every block gained ``'is_watermark'`` and
        ``'is_vertical_margin_note'`` set to 0 or 1.
    """

    def block_key(block_data):
        # tuple() keeps the key hashable when the bbox is stored as a list.
        return tuple(block_data['bbox']), block_data['text']

    potential_watermarks = {}
    potential_margin_notes = {}

    for page_id, page_content in result_dict.items():
        if not page_id.startswith('page_'):
            continue
        for block_id, block_data in page_content.items():
            if not block_id.startswith('block_') or 'dir' not in block_data:
                continue
            coordinates_text = block_key(block_data)
            direction = block_data['dir']
            angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

            if 5 < angle < 85:  # slanted text: watermark candidate
                potential_watermarks[coordinates_text] = (
                    potential_watermarks.get(coordinates_text, 0) + 1
                )
            if 85 < angle < 105:  # vertical text: margin-note candidate
                potential_margin_notes[coordinates_text] = (
                    potential_margin_notes.get(coordinates_text, 0) + 1
                )

    # A candidate must show up on more than half of the entries.
    threshold = len(result_dict) // 2
    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith('page_'):
            continue
        for block_id, block_data in blocks.items():
            # Same filter as the counting pass: page dicts may hold entries
            # that are not blocks and have no bbox/text.
            if not block_id.startswith('block_'):
                continue
            coordinates_text = block_key(block_data)
            block_data['is_watermark'] = 1 if coordinates_text in watermarks else 0
            block_data['is_vertical_margin_note'] = (
                1 if coordinates_text in margin_notes else 0
            )

    return result_dict
"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
def __is_a_word(sentence):
    """Tell whether ``sentence`` is a single short token.

    Returns True for exactly one character in the CJK range U+4E00-U+9FA5,
    or for one or two ASCII letters/digits; False for anything else.
    """
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    is_alphanumeric = bool(re.fullmatch(r'[a-zA-Z0-9]+', sentence))
    return is_alphanumeric and len(sentence) <= 2
def __get_text_color(num):
    """Split a packed colour integer into its ``(red, green, blue)`` bytes.

    Red is taken from bits 16-23, green from bits 8-15 and blue from the
    lowest eight bits.
    """
    return (num >> 16) & 255, (num >> 8) & 255, num & 255
def __is_empty_side_box(text_block):
    """Tell whether a block shows no visible content.

    The block counts as empty when every span is either blank after
    stripping or drawn in pure white ``(255, 255, 255)``.
    """
    for line in text_block['lines']:
        for span in line['spans']:
            rgb = __get_text_color(span['color'])
            has_text = len(span['text'].strip()) > 0
            if has_text and rgb != (255, 255, 255):
                # One visible, non-white piece of text is enough.
                return False
    return True
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks found at the page sides.

    Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` holds are examined.  A block is tagged ``VERTICAL_TEXT`` when it
    has more than one line, every line has exactly one span holding a single
    short token, and all those spans share the same integer left edge.
    Otherwise it is tagged ``ROTATE_TEXT`` if any line's ``'dir'`` differs
    from ``(1, 0)``.

    Args:
        pymu_text_block (list): Text blocks (format reference:
            test/assets/papre/pymu_textblocks.json); modified in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        tuple: ``(pymu_text_block, removed_text_block)`` - the remaining
        blocks and the removed, tagged blocks.
    """
    removed_text_block = []

    for block in pymu_text_block:
        lines = block['lines']
        # Only boxes at the page sides are candidates.
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue

        one_token_per_line = (
            all(
                __is_a_word(line['spans'][0]['text'])
                for line in lines
                if len(line['spans']) > 0
            )
            and len(lines) > 1
            and all(len(line['spans']) == 1 for line in lines)
        )
        if one_token_per_line:
            # Stacked vertically means every span starts at the same x0.
            left_edges = {int(line['spans'][0]['bbox'][0]) for line in lines}
            if len(left_edges) == 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # A single line that is not left-to-right removes the whole block.
        if any(line['dir'] != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left and right body-text limits from side blocks.

    Blocks whose right edge is left of the page centre push the left limit
    to their right edge; all other blocks pull the right limit to their left
    edge.

    Args:
        rotate_bbox (list): Dicts with a ``'bbox'`` entry.
        page_width: Page width.
        page_height: Page height (not used).

    Returns:
        tuple: ``(left_x + 1, right_x - 1)``.
    """
    half_width = page_width / 2
    boxes = [item['bbox'] for item in rotate_bbox]

    left_candidates = [box[2] for box in boxes if box[2] < half_width]
    right_candidates = [box[0] for box in boxes if not box[2] < half_width]

    left_x = max([0] + left_candidates)
    right_x = min([page_width] + right_candidates)
    return left_x + 1, right_x - 1
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blocks at the page sides that show no visible content.

    Args:
        pymu_text_block (list): Text blocks (format reference:
            test/assets/papre/pymu_textblocks.json); modified in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        tuple: ``(pymu_text_block, removed_text_block)`` - the remaining
        blocks and the removed blocks, each tagged ``EMPTY_SIDE_BLOCK``.
    """
    # A block qualifies only if it sits at a page side and is empty.
    removed_text_block = [
        block
        for block in pymu_text_block
        if is_vbox_on_side(block['bbox'], page_width, page_height, 0.2)
        and __is_empty_side_box(block)
    ]

    for block in removed_text_block:
        block['tag'] = EMPTY_SIDE_BLOCK
    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
"""
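Take the bboxes that the API returns for a PDF and decide which to keep based on how they overlap.
1. First drop the bboxes that lie on top of images (tables count as images here).
2. Then drop the image bboxes that lie on top of text blocks.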
"""
from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
_is_left_overlap)
def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
                                  text_raw_blocks: list):
    """Resolve overlaps between images, tables, equations and text blocks.

    ``text_raw_blocks`` has the structure PyMuPDF returns (sample:
    test/assets/papre/pymu_textblocks.json).  The current, deliberately
    blunt, rules are applied in this order:

    1. Text blocks lying completely inside an image or a table are tagged
       (``ON_IMAGE_TEXT`` / ``ON_TABLE_TEXT``) and removed.
    2. Equations touching an image are removed.
    3. Equations touching a table are removed.
    4. Images touching a remaining text block are set aside.
    5. Images touching each other are all set aside (their boxes are not
       adjusted for now).

    All five input lists are modified in place.

    Args:
        images (list): Image bounding boxes.
        tables (list): Table bounding boxes.
        interline_equations (list): Display equations; the first four items
            of each are its bounding box.
        inline_equations (list): Inline equations, same layout.
        text_raw_blocks (list): Text block dicts with a ``'bbox'`` entry.

    Returns:
        tuple: ``(images, tables, interline_equations, inline_equations,
        text_raw_blocks, text_block_removed, images_backup,
        text_block_removed_2)``; ``text_block_removed_2`` is always empty
        because the equation/text merging step is disabled.
    """
    text_block_removed = []
    images_backup = []
    recorded_ids = set()  # guards against recording one block several times

    # Text blocks lying on images.
    for image_box in images:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], image_box):
                text_block['tag'] = ON_IMAGE_TEXT
                if id(text_block) not in recorded_ids:
                    recorded_ids.add(id(text_block))
                    text_block_removed.append(text_block)
    # Text blocks lying on tables (the table tag wins when both apply).
    for table_box in tables:
        for text_block in text_raw_blocks:
            if _is_in(text_block['bbox'], table_box):
                text_block['tag'] = ON_TABLE_TEXT
                if id(text_block) not in recorded_ids:
                    recorded_ids.add(id(text_block))
                    text_block_removed.append(text_block)
    for text_block in text_block_removed:
        if text_block in text_raw_blocks:
            text_raw_blocks.remove(text_block)

    # Equations touching an image first, then equations touching a table.
    for covering_boxes in (images, tables):
        covered_equations = []
        for box in covering_boxes:
            covered_equations.extend(
                eq for eq in interline_equations if _is_in_or_part_overlap(box, eq[:4])
            )
            covered_equations.extend(
                eq for eq in inline_equations if _is_in_or_part_overlap(box, eq[:4])
            )
        for eq in covered_equations:
            if eq in interline_equations:
                interline_equations.remove(eq)
            if eq in inline_equations:
                inline_equations.remove(eq)

    # An image that touches text is dropped in favour of the text.
    for image_box in images:
        if any(
            _is_in_or_part_overlap(image_box, text_block['bbox'])
            for text_block in text_raw_blocks
        ):
            images_backup.append(image_box)
    for image_box in images_backup:
        images.remove(image_box)

    # Images touching each other are all kept out of the layout computation.
    overlapping = set()
    for i in range(len(images)):
        for j in range(i + 1, len(images)):
            if _is_in_or_part_overlap(images[i], images[j]):
                overlapping.add(i)
                overlapping.add(j)
    # sorted(): a stable backup order instead of set iteration order.
    images_backup.extend(images[idx] for idx in sorted(overlapping))
    # Slice assignment keeps the caller's list in step with the returned one
    # instead of leaving ``None`` placeholders behind in it.
    images[:] = [img for idx, img in enumerate(images) if idx not in overlapping]

    # Merging text blocks into overlapping display equations is currently
    # disabled, so this list stays empty.
    text_block_removed_2 = []

    return images, tables, interline_equations, inline_equations, text_raw_blocks, text_block_removed, images_backup, text_block_removed_2
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """Tell whether any two body text blocks overlap horizontally.

    Such an overlap most likely means an equation was not detected, in which
    case the PDF is not processed any further.

    Only blocks lying entirely between ``clip_y0`` and ``clip_y1`` are
    compared: ``clip_y0`` is the largest top edge among the header boxes
    (0 without headers), ``clip_y1`` the smallest bottom edge among the
    footer boxes (the lowest text bottom edge without footers).

    Args:
        text_blocks (list): Dicts with a ``'bbox'`` entry.
        header (list): Header boxes.
        footer (list): Footer boxes.

    Returns:
        bool: True when ``_is_left_overlap`` holds for some pair in either
        order, otherwise False.
    """
    if len(text_blocks) == 0:
        return False

    page_max_y = max(blk['bbox'][3] for blk in text_blocks)
    clip_y0 = max([item[1] for item in header]) if len(header) > 0 else 0
    clip_y1 = min([item[3] for item in footer]) if len(footer) > 0 else page_max_y

    txt_bboxes = [
        blk['bbox']
        for blk in text_blocks
        if blk['bbox'][1] >= clip_y0 and blk['bbox'][3] <= clip_y1
    ]

    for idx, first in enumerate(txt_bboxes):
        for second in txt_bboxes[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True
    return False
def check_useful_block_horizontal_overlap(useful_blocks: list) -> tuple:
    """Find the first pair of blocks that overlap horizontally.

    Such an overlap most likely means an equation was not detected, in which
    case the PDF is not processed any further.

    Args:
        useful_blocks (list): Dicts with a ``'bbox'`` entry.  Blocks whose
            top edge is negative are ignored.

    Returns:
        tuple: ``(True, smaller_bbox, bigger_bbox)`` for the first overlapping
        pair (on equal areas the earlier block is reported as the smaller
        one), otherwise ``(False, None, None)``.  The empty-input case now
        returns the same 3-tuple shape instead of a bare ``False``, so
        callers can always unpack the result.
    """
    if len(useful_blocks) == 0:
        return False, None, None

    page_min_y = 0
    page_max_y = max(blk['bbox'][3] for blk in useful_blocks)
    useful_bboxes = [
        blk['bbox']
        for blk in useful_blocks
        if blk['bbox'][1] >= page_min_y and blk['bbox'][3] <= page_max_y
    ]

    def bbox_area(bbox):
        return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])

    for idx, first in enumerate(useful_bboxes):
        for second in useful_bboxes[idx + 1:]:
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                if bbox_area(first) > bbox_area(second):
                    return True, second, first
                return True, first, second

    return False, None, None
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """Insert a space between line fragments that share one text row.

    Within each block, when a line has the same integer top and bottom
    coordinates as the line before it, a single space is prepended to the
    text of its first span.

    Args:
        pdf_info_dict (dict): Must hold the keys ``'page_0'`` to
            ``'page_{n-1}'`` where ``n = len(pdf_info_dict)``; each page has
            ``'preproc_blocks'``, whose blocks have ``'lines'`` with
            ``'bbox'`` and ``'spans'``.

    Returns:
        dict: The same dictionary, modified in place.
    """
    for i in range(len(pdf_info_dict)):
        for block in pdf_info_dict[f'page_{i}']['preproc_blocks']:
            # The first line of a block is compared against a zero box.
            prev_top, prev_bottom = 0, 0
            for line in block['lines']:
                _, top, _, bottom = line['bbox']
                same_row = int(top) == int(prev_top) and int(bottom) == int(prev_bottom)
                if same_row:
                    first_span = line['spans'][0]
                    first_span['text'] = ' ' + first_span['text']
                prev_top, prev_bottom = top, bottom
    return pdf_info_dict
"""
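Collect the cross-page, document-wide statistics that are needed:
- font sizes, ordered from largest to smallest
- the top 5 by share of the body-text area
- average line spacing of the body text
- average character spacing of the body text
- average character width of the body text
- average character height of the body text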
"""
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment