Commit 6c8f5638 (unverified), authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1027 from icecraft/refactor/move_defs

refactor: move some constants or enums defs to config folder
parents bc992433 b492c19c
This diff is collapsed.
import copy
from loguru import logger
from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';')
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType
# Characters treated as the end of a sentence: when the last span of a block
# ends with one of them, the block is not merged with its neighbour.
# NOTE(review): each ASCII mark appeared twice in this tuple, which looks like
# the full-width CJK variants were lost to character normalization; they are
# restored here -- confirm against the upstream file.
LINE_STOP_FLAG = (
    '.',
    '!',
    '?',
    '。',
    '！',
    '？',
    ')',
    '）',
    '"',
    '”',
    ':',
    '：',
    ';',
    '；',
)
# Characters that close a list item; a line ending with one of them is tagged
# as a list end line.
LIST_END_FLAG = ('.', '。', ';', '；')
class ListLineTag:
    """Keys set on a line dict to flag where a list item starts or ends."""

    # Set to True on the first line of a list item.
    IS_LIST_START_LINE = 'is_list_start_line'
    # Set to True on the last line of a list item.
    IS_LIST_END_LINE = 'is_list_end_line'
def __process_blocks(blocks):
......@@ -27,12 +40,14 @@ def __process_blocks(blocks):
# 如果当前块是 text 类型
if current_block['type'] == 'text':
current_block["bbox_fs"] = copy.deepcopy(current_block["bbox"])
if 'lines' in current_block and len(current_block["lines"]) > 0:
current_block['bbox_fs'] = [min([line['bbox'][0] for line in current_block['lines']]),
min([line['bbox'][1] for line in current_block['lines']]),
max([line['bbox'][2] for line in current_block['lines']]),
max([line['bbox'][3] for line in current_block['lines']])]
current_block['bbox_fs'] = copy.deepcopy(current_block['bbox'])
if 'lines' in current_block and len(current_block['lines']) > 0:
current_block['bbox_fs'] = [
min([line['bbox'][0] for line in current_block['lines']]),
min([line['bbox'][1] for line in current_block['lines']]),
max([line['bbox'][2] for line in current_block['lines']]),
max([line['bbox'][3] for line in current_block['lines']]),
]
current_group.append(current_block)
# 检查下一个块是否存在
......@@ -83,9 +98,10 @@ def __is_list_or_index_block(block):
# logger.info(f"block_weight_radio: {block_weight_radio}")
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and
block['bbox_fs'][2] - last_line['bbox'][2] > line_height
if (
first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2
and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2
and block['bbox_fs'][2] - last_line['bbox'][2] > line_height
):
multiple_para_flag = True
......@@ -93,14 +109,14 @@ def __is_list_or_index_block(block):
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if (
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
):
external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1
line_text = ""
line_text = ''
for span in line['spans']:
span_type = span['type']
......@@ -148,15 +164,19 @@ def __is_list_or_index_block(block):
if line_text[-1].isdigit():
num_end_count += 1
if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
if (
num_start_count / len(lines_text_list) >= 0.8
or num_end_count / len(lines_text_list) >= 0.8
):
line_num_flag = True
if flag_end_count / len(lines_text_list) >= 0.8:
line_end_flag = True
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
and line_num_flag
):
if (
left_close_num / len(block['lines']) >= 0.8
or right_close_num / len(block['lines']) >= 0.8
) and line_num_flag:
for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.Index
......@@ -164,20 +184,20 @@ def __is_list_or_index_block(block):
# 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
# 补充条件block的长宽比有要求
elif (
external_sides_not_close_num >= 2 and
center_close_num == len(block['lines']) and
external_sides_not_close_num / len(block['lines']) >= 0.5 and
block_height / block_weight > 0.4
external_sides_not_close_num >= 2
and center_close_num == len(block['lines'])
and external_sides_not_close_num / len(block['lines']) >= 0.5
and block_height / block_weight > 0.4
):
for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.List
elif (
left_close_num >= 2
and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
and not multiple_para_flag
# and block_weight_radio > 0.27
left_close_num >= 2
and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
and not multiple_para_flag
# and block_weight_radio > 0.27
):
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
if left_close_num / len(block['lines']) > 0.8:
......@@ -189,10 +209,15 @@ def __is_list_or_index_block(block):
# 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item
elif line_end_flag:
for i, line in enumerate(block['lines']):
if len(lines_text_list[i]) > 0 and lines_text_list[i][-1] in LIST_END_FLAG:
if (
len(lines_text_list[i]) > 0
and lines_text_list[i][-1] in LIST_END_FLAG
):
line[ListLineTag.IS_LIST_END_LINE] = True
if i + 1 < len(block['lines']):
block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
block['lines'][i + 1][
ListLineTag.IS_LIST_START_LINE
] = True
# line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
else:
line_start_flag = False
......@@ -201,7 +226,10 @@ def __is_list_or_index_block(block):
line[ListLineTag.IS_LIST_START_LINE] = True
line_start_flag = False
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
if (
abs(block['bbox_fs'][2] - line['bbox'][2])
> 0.1 * block_weight
):
line[ListLineTag.IS_LIST_END_LINE] = True
line_start_flag = True
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致
......@@ -243,11 +271,13 @@ def __merge_2_text_blocks(block1, block2):
first_span = first_line['spans'][0]
if len(first_span['content']) > 0:
span_start_with_num = first_span['content'][0].isdigit()
if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
and not last_span['content'].endswith(LINE_STOP_FLAG)
# 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight
and not span_start_with_num
if (
abs(block2['bbox_fs'][2] - last_line['bbox'][2])
< line_height
and not last_span['content'].endswith(LINE_STOP_FLAG)
# 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight
and not span_start_with_num
):
if block1['page_num'] != block2['page_num']:
for line in block1['lines']:
......@@ -284,7 +314,6 @@ def __is_list_group(text_blocks_group):
def __para_merge_page(blocks):
page_text_blocks_groups = __process_blocks(blocks)
for text_blocks_group in page_text_blocks_groups:
if len(text_blocks_group) > 0:
# 需要先在合并前对所有block判断是否为list or index block
for block in text_blocks_group:
......@@ -293,7 +322,6 @@ def __para_merge_page(blocks):
# logger.info(f"{block['type']}:{block}")
if len(text_blocks_group) > 1:
# 在合并前判断这个group 是否是一个 list group
is_list_group = __is_list_group(text_blocks_group)
......@@ -305,11 +333,18 @@ def __para_merge_page(blocks):
if i - 1 >= 0:
prev_block = text_blocks_group[i - 1]
if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group:
if (
current_block['type'] == 'text'
and prev_block['type'] == 'text'
and not is_list_group
):
__merge_2_text_blocks(current_block, prev_block)
elif (
(current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or
(current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index)
current_block['type'] == BlockType.List
and prev_block['type'] == BlockType.List
) or (
current_block['type'] == BlockType.Index
and prev_block['type'] == BlockType.Index
):
__merge_2_list_blocks(current_block, prev_block)
......@@ -339,4 +374,4 @@ if __name__ == '__main__':
# 调用函数
groups = __process_blocks(input_blocks)
for group_index, group in enumerate(groups):
print(f"Group {group_index}: {group}")
print(f'Group {group_index}: {group}')
......@@ -2,38 +2,47 @@ import time
from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.layout.layout_sort import (LAYOUT_UNPROC, get_bboxes_layout,
get_columns_cnt_of_layout)
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
combine_chars_to_pymudict
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \
fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \
remove_overlaps_low_confidence_spans
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict, remove_chars_in_text_blocks,
replace_equations_in_textblock)
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans,
fix_discarded_block,
sort_blocks_by_layout)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.resolve_bbox_conflict import \
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks)
useful_blocks.append({'bbox': bbox[:4]})
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
check_useful_block_horizontal_overlap(useful_blocks)
)
if is_useful_block_horz_overlap:
logger.warning(
f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
)
for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox)
......@@ -41,27 +50,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str: str):
    r"""Replace \u0002 and \u0003 with an apostrophe.

    These characters become garbled when extracted using pymupdf. In fact,
    they were originally quotation marks.

    Drawback: This issue is only observed in English text; it has not been
    found in Chinese text so far.

    Args:
        text_str (str): raw text

    Returns:
        str: the text with both characters replaced by ``'``. A falsy input
        (``None`` or an empty string) is returned unchanged.
    """
    # Guard first so that None passes through instead of raising.
    if not text_str:
        return text_str
    return text_str.replace('\u0002', "'").replace('\u0003', "'")
def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
"blocks"
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
'blocks'
]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock(
......@@ -71,189 +80,254 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = []
for v in text_blocks:
for line in v["lines"]:
for span in line["spans"]:
bbox = span["bbox"]
for line in v['lines']:
for span in line['spans']:
bbox = span['bbox']
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation):
if span.get('type') not in (
ContentType.InlineEquation,
ContentType.InterlineEquation,
):
spans.append(
{
"bbox": list(span["bbox"]),
"content": __replace_STX_ETX(span["text"]),
"type": ContentType.Text,
"score": 1.0,
'bbox': list(span['bbox']),
'content': __replace_STX_ETX(span['text']),
'type': ContentType.Text,
'score': 1.0,
}
)
return spans
def replace_text_span(pymu_spans, ocr_spans):
    """Swap the text spans of the OCR result for the pymupdf spans.

    Args:
        pymu_spans (list): spans appended after the OCR spans that are kept.
        ocr_spans (list): span dicts; every span whose ``'type'`` equals
            ``ContentType.Text`` is dropped.

    Returns:
        list: a new list holding the remaining OCR spans followed by
        ``pymu_spans``.
    """
    non_text_spans = [
        span for span in ocr_spans if span['type'] != ContentType.Text
    ]
    return non_text_spans + pymu_spans
def parse_page_core(
    pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
):
    """Parse a single page and build its page-info structure.

    Args:
        pdf_docs: document indexed by ``page_id``; the selected page is passed
            to the span extraction and image cutting helpers.
        magic_model: object queried for the blocks, equations, spans and size
            of the page.
        page_id: index of the page to parse.
        pdf_bytes_md5: forwarded to ``ocr_cut_image_and_table``.
        imageWriter: forwarded to ``ocr_cut_image_and_table``.
        parse_mode: ``'txt'`` or ``'ocr'``.

    Returns:
        The value returned by ``ocr_construct_page_component_v2`` for the page.

    Raises:
        Exception: if ``parse_mode`` is neither ``'txt'`` nor ``'ocr'``.
    """
    need_drop = False
    drop_reason = []

    # Fetch from magic_model the block information used further down.
    img_blocks = magic_model.get_imgs(page_id)
    table_blocks = magic_model.get_tables(page_id)
    discarded_blocks = magic_model.get_discarded(page_id)
    text_blocks = magic_model.get_text_blocks(page_id)
    title_blocks = magic_model.get_title_blocks(page_id)
    inline_equations, interline_equations, interline_equation_blocks = (
        magic_model.get_equations(page_id)
    )

    page_w, page_h = magic_model.get_page_size(page_id)

    spans = magic_model.get_all_spans(page_id)

    # Build the spans according to parse_mode.
    if parse_mode == 'txt':
        # Replace the text-type OCR spans with the pymupdf spans.
        pymu_spans = txt_spans_extract(
            pdf_docs[page_id], inline_equations, interline_equations
        )
        spans = replace_text_span(pymu_spans, spans)
    elif parse_mode == 'ocr':
        pass
    else:
        raise Exception('parse_mode must be txt or ocr')

    # Among overlapping spans, drop the ones with the lower confidence.
    spans, _ = remove_overlaps_low_confidence_spans(spans)
    # Among overlapping spans, drop the smaller ones.
    spans, _ = remove_overlaps_min_spans(spans)

    # Cut out the images and tables.
    spans = ocr_cut_image_and_table(
        spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter
    )

    # Gather the bboxes of all blocks.
    # interline_equation_blocks is not accurate enough, so it is emptied on
    # purpose and interline_equations is used in its place.
    interline_equation_blocks = []
    equation_blocks = interline_equation_blocks or interline_equations
    all_bboxes, all_discarded_blocks, drop_reasons = (
        ocr_prepare_bboxes_for_layout_split(
            img_blocks,
            table_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            equation_blocks,
            page_w,
            page_h,
        )
    )

    if len(drop_reasons) > 0:
        need_drop = True
        drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)

    # Handle the discarded blocks first; they do not take part in the layout.
    discarded_block_with_spans, spans = fill_spans_in_blocks(
        all_discarded_blocks, spans, 0.4
    )
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

    # Skip the page when it has no bbox at all.
    if len(all_bboxes) == 0:
        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
        return ocr_construct_page_component_v2(
            [],
            [],
            page_id,
            page_w,
            page_h,
            [],
            [],
            [],
            interline_equations,
            fix_discarded_blocks,
            need_drop,
            drop_reason,
        )

    # Before splitting, check whether bboxes overlap horizontally. If they do,
    # the pdf is considered beyond what can currently be handled well; such
    # overlaps are most likely caused by interline equations or tables that
    # were not recognised correctly.
    # Keep removing the smaller bbox until no horizontal overlap is left.
    while True:
        is_useful_block_horz_overlap, all_bboxes = (
            remove_horizontal_overlap_block_which_smaller(all_bboxes)
        )
        if not is_useful_block_horz_overlap:
            break
        need_drop = True
        drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP)

    # Compute the layout from the block information.
    page_boundry = [0, 0, page_w, page_h]
    layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)

    if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
        logger.warning(
            f'skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}'
        )
        need_drop = True
        drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT)

    # Flag complicated layouts and layouts with more than 2 columns.
    if any(lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes):
        # Complicated layout.
        logger.warning(
            f'skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}'
        )
        need_drop = True
        drop_reason.append(DropReason.COMPLICATED_LAYOUT)

    layout_column_width = get_columns_cnt_of_layout(layout_tree)
    if layout_column_width > 2:
        # More than 2 columns.
        logger.warning(
            f'skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}'
        )
        need_drop = True
        drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS)

    # Sort the blocks that are kept on this page by layout order.
    sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)

    # Fill the spans into the sorted blocks.
    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)

    # Apply the fix step to the blocks.
    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)

    # Collect the lists that QA needs separately.
    images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)

    # Build the page info.
    page_info = ocr_construct_page_component_v2(
        fix_blocks,
        layout_bboxes,
        page_id,
        page_w,
        page_h,
        layout_tree,
        images,
        tables,
        interline_equations,
        fix_discarded_blocks,
        need_drop,
        drop_reason,
    )
    return page_info
def pdf_parse_union(pdf_bytes,
model_list,
imageWriter,
parse_mode,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
def pdf_parse_union(
pdf_bytes,
model_list,
imageWriter,
parse_mode,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)
pdf_docs = fitz.open('pdf', pdf_bytes)
'''初始化空的pdf_info_dict'''
"""初始化空的pdf_info_dict"""
pdf_info_dict = {}
'''用model_list和docs对象初始化magic_model'''
"""用model_list和docs对象初始化magic_model"""
magic_model = MagicModel(model_list, pdf_docs)
'''根据输入的起始范围解析pdf'''
"""根据输入的起始范围解析pdf"""
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(pdf_docs) - 1
)
if end_page_id > len(pdf_docs) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length")
logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(pdf_docs) - 1
'''初始化启动时间'''
"""初始化启动时间"""
start_time = time.time()
for page_id, page in enumerate(pdf_docs):
'''debug时输出每页解析的耗时'''
"""debug时输出每页解析的耗时."""
if debug_mode:
time_now = time.time()
logger.info(
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
)
start_time = time_now
'''解析pdf中的每一页'''
"""解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
page_info = parse_page_core(
pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
)
else:
page_w = page.rect.width
page_h = page.rect.height
page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
[], [], [], [],
True, "skip page")
pdf_info_dict[f"page_{page_id}"] = page_info
page_info = ocr_construct_page_component_v2(
[], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
)
pdf_info_dict[f'page_{page_id}'] = page_info
"""分段"""
para_split(pdf_info_dict, debug_mode=debug_mode)
......@@ -261,7 +335,7 @@ def pdf_parse_union(pdf_bytes,
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = {
"pdf_info": pdf_info_list,
'pdf_info': pdf_info_list,
}
return new_pdf_info_dict
......
......@@ -7,17 +7,17 @@ from typing import List
import torch
from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
......@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_discarded_block,
fix_block_spans_v2)
fix_block_spans_v2,
fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
......@@ -164,8 +164,8 @@ class ModelSingleton:
def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
prepare_inputs)
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
boxes2inputs, parse_logits, prepare_inputs)
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
......@@ -206,7 +206,9 @@ def cal_block_index(fix_blocks, sorted_bboxes):
del block['real_lines']
import numpy as np
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
recursive_xy_cut
random_boxes = np.array(block_bboxes)
np.random.shuffle(random_boxes)
......@@ -291,7 +293,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list.append(bbox)
elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
bbox = block['bbox']
block["real_lines"] = copy.deepcopy(block['lines'])
block['real_lines'] = copy.deepcopy(block['lines'])
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = []
for line in lines:
......
from abc import ABC, abstractmethod
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
class AbsPipe(ABC):
......
from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
......
from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf
......
......@@ -2,9 +2,9 @@ import json
from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
......
from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
from magic_pdf.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> bool:
"""
判断一个页面是否伪单列。
"""判断一个页面是否伪单列。
Args:
page_info (dict): 页面信息字典,包括'_layout_tree'和'preproc_blocks'。
Returns:
Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info),否则返回(False, None)。
"""
layout_tree = page_info['_layout_tree']
layout_column_width = get_columns_cnt_of_layout(layout_tree)
......@@ -41,27 +39,22 @@ def __is_pseudo_single_column(page_info) -> bool:
if num_lines > 20:
radio = num_satisfying_lines / num_lines
if radio >= 0.5:
extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
extra_info = f'{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}'
block_text = []
for line in lines:
if line['spans']:
for span in line['spans']:
block_text.append(span['text'])
logger.warning(f"pseudo_single_column block_text: {block_text}")
logger.warning(f'pseudo_single_column block_text: {block_text}')
return True, extra_info
return False, None
def pdf_post_filter(page_info) -> tuple:
    """Check whether a parsed page is acceptable.

    Args:
        page_info: page information, passed on to the pseudo-single-column
            check.

    Returns:
        tuple: ``(True, None)`` if the pdf meets the requirements. Otherwise
        ``(False, err_msg)``, where ``err_msg`` is a dict with the keys
        ``'_need_drop'``, ``'_drop_reason'`` and ``'extra_info'``.
    """
    bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
    if bool_is_pseudo_single_column:
        return False, {
            '_need_drop': True,
            '_drop_reason': DropReason.PSEUDO_SINGLE_COLUMN,
            'extra_info': extra_info,
        }

    return True, None
from loguru import logger
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image
......@@ -29,9 +29,7 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict:
"""
返回一个dict, key为bbox, 值是图片地址
"""
"""返回一个dict, key为bbox, 值是图片地址."""
image_info = []
image_backup_info = []
table_info = []
......@@ -46,26 +44,26 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
for bbox in image_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path})
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
image_info.append({'bbox': bbox, 'image_path': image_path})
for bbox in images_overlap_backup:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path})
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
image_backup_info.append({'bbox': bbox, 'image_path': image_path})
for bbox in table_bboxes:
if not check_img_bbox(bbox):
continue
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path})
image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
table_info.append({'bbox': bbox, 'image_path': image_path})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
def check_img_bbox(bbox) -> bool:
    """Tell whether a bbox has a positive width and height.

    Args:
        bbox: sequence read as ``(x0, y0, x1, y1)``.

    Returns:
        bool: ``False`` (after logging a warning) when ``x0 >= x1`` or
        ``y0 >= y1``, ``True`` otherwise.
    """
    if bbox[0] >= bbox[2] or bbox[1] >= bbox[3]:
        logger.warning(f'image_bboxes: 错误的box, {bbox}')
        return False
    return True
This diff is collapsed.
from loguru import logger
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
calculate_iou, calculate_vertical_projection_overlap_ratio
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
title_blocks, interline_equation_blocks, page_w, page_h):
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import (
calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
calculate_vertical_projection_overlap_ratio,
get_minbox_if_overlap_by_ratio)
from magic_pdf.pre_proc.remove_bbox_overlap import \
remove_overlap_between_bbox_for_block
def _layout_bbox_record(block, block_type):
    """Build the 13-slot list describing one block for layout splitting.

    Slots 0-3 hold the four bbox coordinates, slot 7 the block type and
    slot 12 the block's ``score``; every other slot is ``None``.
    """
    x0, y0, x1, y1 = block['bbox']
    return [
        x0, y0, x1, y1,
        None, None, None,
        block_type,
        None, None, None, None,
        block['score'],
    ]


def ocr_prepare_bboxes_for_layout_split(
    img_blocks,
    table_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Collect per-type blocks into bbox records and clean up their overlaps.

    Args:
        img_blocks, table_blocks, text_blocks, title_blocks,
        interline_equation_blocks, discarded_blocks: iterables of dicts, each
            providing a four-item ``'bbox'`` and a ``'score'``.
        page_w: page width, used for the footnote width threshold.
        page_h: page height, used for the footnote position threshold.

    Returns:
        A tuple ``(all_bboxes, all_discarded_blocks, drop_reasons)`` where the
        first two are lists of 13-slot records and ``drop_reasons`` is the
        second value returned by ``remove_overlap_between_bbox_for_block``.
    """
    all_bboxes = []
    all_discarded_blocks = []

    # Same insertion order as before: image, table, text, title, equation.
    typed_groups = (
        (img_blocks, BlockType.Image),
        (table_blocks, BlockType.Table),
        (text_blocks, BlockType.Text),
        (title_blocks, BlockType.Title),
        (interline_equation_blocks, BlockType.InterlineEquation),
    )
    for blocks, block_type in typed_groups:
        all_bboxes.extend(
            _layout_bbox_record(block, block_type) for block in blocks
        )

    # --- Resolve nested / overlapping blocks ---
    # A text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline-equation vs. title/text conflicts come in two flavours:
    # 1. IoU with a text-type box close to 1 -> trust the equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2. Equation box contained in a much larger text-type box -> trust the
    #    text box; the equation box is dropped by the "large box contains
    #    small box" pass further down.

    # Every discarded block is recorded; those wider than 1/3 of the page,
    # taller than 10 and starting in the lower half of the page are also
    # treated as footnotes.
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        all_discarded_blocks.append(
            _layout_bbox_record(discarded, BlockType.Discarded)
        )
        # Footnotes join all_bboxes so they take part in layout computation.
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
            all_bboxes.append(
                _layout_bbox_record(discarded, BlockType.Footnote)
            )

    # If a large box still contains a small one after the steps above,
    # remove the small one.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # Separate the remaining boxes so the later layout split does not fail.
    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)

    return all_bboxes, all_discarded_blocks, drop_reasons
......@@ -64,18 +185,64 @@ def add_bboxes(blocks, block_type, bboxes):
for block in blocks:
x0, y0, x1, y1 = block['bbox']
if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
BlockType.ImageBody,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableBody,
BlockType.TableCaption,
BlockType.TableFootnote,
]:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
block_type,
None,
None,
None,
None,
block['score'],
block['group_id'],
]
)
else:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
block_type,
None,
None,
None,
None,
block['score'],
]
)
def ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h
img_body_blocks,
img_caption_blocks,
img_footnote_blocks,
table_body_blocks,
table_caption_blocks,
table_footnote_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
):
all_bboxes = []
......@@ -89,40 +256,40 @@ def ocr_prepare_bboxes_for_layout_split_v2(
add_bboxes(title_blocks, BlockType.Title, all_bboxes)
add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
'''block嵌套问题解决'''
'''文本框与标题框重叠,优先信任文本框'''
"""block嵌套问题解决"""
"""文本框与标题框重叠,优先信任文本框"""
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''任何框体与舍弃框重叠,优先信任舍弃框'''
"""任何框体与舍弃框重叠,优先信任舍弃框"""
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
# interline_equation 与title或text框冲突的情况,分两种情况处理
'''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框'''
"""interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
"""interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
# 通过后续大框套小框逻辑删除
'''discarded_blocks'''
"""discarded_blocks"""
all_discarded_blocks = []
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
'''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的'''
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
footnote_blocks = []
for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox']
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
footnote_blocks.append([x0, y0, x1, y1])
'''移除在footnote下面的任何框'''
"""移除在footnote下面的任何框"""
need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
if len(need_remove_blocks) > 0:
for block in need_remove_blocks:
all_bboxes.remove(block)
all_discarded_blocks.append(block)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
"""经过以上处理后,还存在大框套小框的情况,则删除小框"""
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks
......@@ -135,7 +302,13 @@ def find_blocks_under_footnote(all_bboxes, footnote_blocks):
for footnote_bbox in footnote_blocks:
footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
# 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8:
if (
block_y0 >= footnote_y1
and calculate_vertical_projection_overlap_ratio(
(block_x0, block_y0, block_x1, block_y1), footnote_bbox
)
>= 0.8
):
if block not in need_remove_blocks:
need_remove_blocks.append(block)
break
......@@ -203,7 +376,12 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
for block in all_bboxes:
for discarded_block in discarded_blocks:
block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
if (
calculate_overlap_area_in_bbox1_area_ratio(
block_bbox, discarded_block['bbox']
)
> 0.6
):
if block not in need_remove:
need_remove.append(block)
break
......@@ -223,10 +401,18 @@ def remove_overlaps_min_blocks(all_bboxes):
if block1 != block2:
block1_bbox = block1[:4]
block2_bbox = block2[:4]
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
overlap_box = get_minbox_if_overlap_by_ratio(
block1_bbox, block2_bbox, 0.8
)
if overlap_box is not None:
block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
if block_to_remove is not None and block_to_remove not in need_remove:
block_to_remove = next(
(block for block in all_bboxes if block[:4] == overlap_box),
None,
)
if (
block_to_remove is not None
and block_to_remove not in need_remove
):
large_block = block1 if block1 != block_to_remove else block2
x1, y1, x2, y2 = large_block[:4]
sx1, sy1, sx2, sy2 = block_to_remove[:4]
......
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio,
calculate_overlap_area_in_bbox1_area_ratio)
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 将每一个line中的span从左到右排序
......@@ -157,7 +157,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]:
block_dict["group_id"] = block[-1]
block_dict['group_id'] = block[-1]
block_spans = []
for span in spans:
span_bbox = span['bbox']
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import re
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
page_no_bboxs, page_w, page_h):
"""
删除页眉页脚,页码
从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
"""
"""删除页眉页脚,页码 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中."""
header = []
footer = []
if len(header) == 0:
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment