Unverified Commit 6c8f5638 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1027 from icecraft/refactor/move_defs

refactor: move some constants or enums defs to config folder
parents bc992433 b492c19c
This diff is collapsed.
import copy import copy
from loguru import logger from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, ContentType LINE_STOP_FLAG = (
'.',
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';') '!',
'?',
'。',
'!',
'?',
')',
')',
'"',
'”',
':',
':',
';',
';',
)
LIST_END_FLAG = ('.', '。', ';', ';') LIST_END_FLAG = ('.', '。', ';', ';')
class ListLineTag: class ListLineTag:
IS_LIST_START_LINE = "is_list_start_line" IS_LIST_START_LINE = 'is_list_start_line'
IS_LIST_END_LINE = "is_list_end_line" IS_LIST_END_LINE = 'is_list_end_line'
def __process_blocks(blocks): def __process_blocks(blocks):
...@@ -27,12 +40,14 @@ def __process_blocks(blocks): ...@@ -27,12 +40,14 @@ def __process_blocks(blocks):
# 如果当前块是 text 类型 # 如果当前块是 text 类型
if current_block['type'] == 'text': if current_block['type'] == 'text':
current_block["bbox_fs"] = copy.deepcopy(current_block["bbox"]) current_block['bbox_fs'] = copy.deepcopy(current_block['bbox'])
if 'lines' in current_block and len(current_block["lines"]) > 0: if 'lines' in current_block and len(current_block['lines']) > 0:
current_block['bbox_fs'] = [min([line['bbox'][0] for line in current_block['lines']]), current_block['bbox_fs'] = [
min([line['bbox'][1] for line in current_block['lines']]), min([line['bbox'][0] for line in current_block['lines']]),
max([line['bbox'][2] for line in current_block['lines']]), min([line['bbox'][1] for line in current_block['lines']]),
max([line['bbox'][3] for line in current_block['lines']])] max([line['bbox'][2] for line in current_block['lines']]),
max([line['bbox'][3] for line in current_block['lines']]),
]
current_group.append(current_block) current_group.append(current_block)
# 检查下一个块是否存在 # 检查下一个块是否存在
...@@ -83,9 +98,10 @@ def __is_list_or_index_block(block): ...@@ -83,9 +98,10 @@ def __is_list_or_index_block(block):
# logger.info(f"block_weight_radio: {block_weight_radio}") # logger.info(f"block_weight_radio: {block_weight_radio}")
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格) # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and if (
abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2
block['bbox_fs'][2] - last_line['bbox'][2] > line_height and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2
and block['bbox_fs'][2] - last_line['bbox'][2] > line_height
): ):
multiple_para_flag = True multiple_para_flag = True
...@@ -93,14 +109,14 @@ def __is_list_or_index_block(block): ...@@ -93,14 +109,14 @@ def __is_list_or_index_block(block):
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if ( if (
line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height
block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height and block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
): ):
external_sides_not_close_num += 1 external_sides_not_close_num += 1
if abs(line_mid_x - block_mid_x) < line_height / 2: if abs(line_mid_x - block_mid_x) < line_height / 2:
center_close_num += 1 center_close_num += 1
line_text = "" line_text = ''
for span in line['spans']: for span in line['spans']:
span_type = span['type'] span_type = span['type']
...@@ -148,15 +164,19 @@ def __is_list_or_index_block(block): ...@@ -148,15 +164,19 @@ def __is_list_or_index_block(block):
if line_text[-1].isdigit(): if line_text[-1].isdigit():
num_end_count += 1 num_end_count += 1
if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8: if (
num_start_count / len(lines_text_list) >= 0.8
or num_end_count / len(lines_text_list) >= 0.8
):
line_num_flag = True line_num_flag = True
if flag_end_count / len(lines_text_list) >= 0.8: if flag_end_count / len(lines_text_list) >= 0.8:
line_end_flag = True line_end_flag = True
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8) if (
and line_num_flag left_close_num / len(block['lines']) >= 0.8
): or right_close_num / len(block['lines']) >= 0.8
) and line_num_flag:
for line in block['lines']: for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.Index return BlockType.Index
...@@ -164,20 +184,20 @@ def __is_list_or_index_block(block): ...@@ -164,20 +184,20 @@ def __is_list_or_index_block(block):
# 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近 # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近
# 补充条件block的长宽比有要求 # 补充条件block的长宽比有要求
elif ( elif (
external_sides_not_close_num >= 2 and external_sides_not_close_num >= 2
center_close_num == len(block['lines']) and and center_close_num == len(block['lines'])
external_sides_not_close_num / len(block['lines']) >= 0.5 and and external_sides_not_close_num / len(block['lines']) >= 0.5
block_height / block_weight > 0.4 and block_height / block_weight > 0.4
): ):
for line in block['lines']: for line in block['lines']:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
return BlockType.List return BlockType.List
elif ( elif (
left_close_num >= 2 left_close_num >= 2
and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2)
and not multiple_para_flag and not multiple_para_flag
# and block_weight_radio > 0.27 # and block_weight_radio > 0.27
): ):
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾 # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
if left_close_num / len(block['lines']) > 0.8: if left_close_num / len(block['lines']) > 0.8:
...@@ -189,10 +209,15 @@ def __is_list_or_index_block(block): ...@@ -189,10 +209,15 @@ def __is_list_or_index_block(block):
# 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item
elif line_end_flag: elif line_end_flag:
for i, line in enumerate(block['lines']): for i, line in enumerate(block['lines']):
if len(lines_text_list[i]) > 0 and lines_text_list[i][-1] in LIST_END_FLAG: if (
len(lines_text_list[i]) > 0
and lines_text_list[i][-1] in LIST_END_FLAG
):
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
if i + 1 < len(block['lines']): if i + 1 < len(block['lines']):
block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True block['lines'][i + 1][
ListLineTag.IS_LIST_START_LINE
] = True
# line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end
else: else:
line_start_flag = False line_start_flag = False
...@@ -201,7 +226,10 @@ def __is_list_or_index_block(block): ...@@ -201,7 +226,10 @@ def __is_list_or_index_block(block):
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
line_start_flag = False line_start_flag = False
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight: if (
abs(block['bbox_fs'][2] - line['bbox'][2])
> 0.1 * block_weight
):
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
line_start_flag = True line_start_flag = True
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致 # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致
...@@ -243,11 +271,13 @@ def __merge_2_text_blocks(block1, block2): ...@@ -243,11 +271,13 @@ def __merge_2_text_blocks(block1, block2):
first_span = first_line['spans'][0] first_span = first_line['spans'][0]
if len(first_span['content']) > 0: if len(first_span['content']) > 0:
span_start_with_num = first_span['content'][0].isdigit() span_start_with_num = first_span['content'][0].isdigit()
if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height if (
and not last_span['content'].endswith(LINE_STOP_FLAG) abs(block2['bbox_fs'][2] - last_line['bbox'][2])
# 两个block宽度差距超过2倍也不合并 < line_height
and abs(block1_weight - block2_weight) < min_block_weight and not last_span['content'].endswith(LINE_STOP_FLAG)
and not span_start_with_num # 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight
and not span_start_with_num
): ):
if block1['page_num'] != block2['page_num']: if block1['page_num'] != block2['page_num']:
for line in block1['lines']: for line in block1['lines']:
...@@ -284,7 +314,6 @@ def __is_list_group(text_blocks_group): ...@@ -284,7 +314,6 @@ def __is_list_group(text_blocks_group):
def __para_merge_page(blocks): def __para_merge_page(blocks):
page_text_blocks_groups = __process_blocks(blocks) page_text_blocks_groups = __process_blocks(blocks)
for text_blocks_group in page_text_blocks_groups: for text_blocks_group in page_text_blocks_groups:
if len(text_blocks_group) > 0: if len(text_blocks_group) > 0:
# 需要先在合并前对所有block判断是否为list or index block # 需要先在合并前对所有block判断是否为list or index block
for block in text_blocks_group: for block in text_blocks_group:
...@@ -293,7 +322,6 @@ def __para_merge_page(blocks): ...@@ -293,7 +322,6 @@ def __para_merge_page(blocks):
# logger.info(f"{block['type']}:{block}") # logger.info(f"{block['type']}:{block}")
if len(text_blocks_group) > 1: if len(text_blocks_group) > 1:
# 在合并前判断这个group 是否是一个 list group # 在合并前判断这个group 是否是一个 list group
is_list_group = __is_list_group(text_blocks_group) is_list_group = __is_list_group(text_blocks_group)
...@@ -305,11 +333,18 @@ def __para_merge_page(blocks): ...@@ -305,11 +333,18 @@ def __para_merge_page(blocks):
if i - 1 >= 0: if i - 1 >= 0:
prev_block = text_blocks_group[i - 1] prev_block = text_blocks_group[i - 1]
if current_block['type'] == 'text' and prev_block['type'] == 'text' and not is_list_group: if (
current_block['type'] == 'text'
and prev_block['type'] == 'text'
and not is_list_group
):
__merge_2_text_blocks(current_block, prev_block) __merge_2_text_blocks(current_block, prev_block)
elif ( elif (
(current_block['type'] == BlockType.List and prev_block['type'] == BlockType.List) or current_block['type'] == BlockType.List
(current_block['type'] == BlockType.Index and prev_block['type'] == BlockType.Index) and prev_block['type'] == BlockType.List
) or (
current_block['type'] == BlockType.Index
and prev_block['type'] == BlockType.Index
): ):
__merge_2_list_blocks(current_block, prev_block) __merge_2_list_blocks(current_block, prev_block)
...@@ -339,4 +374,4 @@ if __name__ == '__main__': ...@@ -339,4 +374,4 @@ if __name__ == '__main__':
# 调用函数 # 调用函数
groups = __process_blocks(input_blocks) groups = __process_blocks(input_blocks)
for group_index, group in enumerate(groups): for group_index, group in enumerate(groups):
print(f"Group {group_index}: {group}") print(f'Group {group_index}: {group}')
...@@ -2,38 +2,47 @@ import time ...@@ -2,38 +2,47 @@ import time
from loguru import logger from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.layout.layout_sort import (LAYOUT_UNPROC, get_bboxes_layout,
get_columns_cnt_of_layout)
from magic_pdf.libs.commons import fitz, get_delta_time from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout
from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v2 import para_split from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2 from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \ from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict combine_chars_to_pymudict, remove_chars_in_text_blocks,
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split replace_equations_in_textblock)
from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, fix_block_spans, \ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
fix_discarded_block ocr_prepare_bboxes_for_layout_split
from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
remove_overlaps_low_confidence_spans fix_block_spans,
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap fix_discarded_block,
sort_blocks_by_layout)
from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans)
from magic_pdf.pre_proc.resolve_bbox_conflict import \
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes): def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = [] useful_blocks = []
for bbox in all_bboxes: for bbox in all_bboxes:
useful_blocks.append({ useful_blocks.append({'bbox': bbox[:4]})
"bbox": bbox[:4] is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
}) check_useful_block_horizontal_overlap(useful_blocks)
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks) )
if is_useful_block_horz_overlap: if is_useful_block_horz_overlap:
logger.warning( logger.warning(
f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}") f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
)
for bbox in all_bboxes.copy(): for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]: if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox) all_bboxes.remove(bbox)
...@@ -41,27 +50,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes): ...@@ -41,27 +50,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
return is_useful_block_horz_overlap, all_bboxes return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str:str): def __replace_STX_ETX(text_str: str):
""" Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks. """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
Drawback: This issue is only observed in English text; it has not been found in Chinese text so far. Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
Args: Args:
text_str (str): raw text text_str (str): raw text
Returns: Returns:
_type_: replaced text _type_: replaced text
""" """
if text_str: if text_str:
s = text_str.replace('\u0002', "'") s = text_str.replace('\u0002', "'")
s = s.replace("\u0003", "'") s = s.replace('\u0003', "'")
return s return s
return text_str return text_str
def txt_spans_extract(pdf_page, inline_equations, interline_equations): def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
"blocks" 'blocks'
] ]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks) text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock( text_blocks = replace_equations_in_textblock(
...@@ -71,189 +80,254 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations): ...@@ -71,189 +80,254 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_blocks = remove_chars_in_text_blocks(text_blocks) text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = [] spans = []
for v in text_blocks: for v in text_blocks:
for line in v["lines"]: for line in v['lines']:
for span in line["spans"]: for span in line['spans']:
bbox = span["bbox"] bbox = span['bbox']
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]): if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue continue
if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation): if span.get('type') not in (
ContentType.InlineEquation,
ContentType.InterlineEquation,
):
spans.append( spans.append(
{ {
"bbox": list(span["bbox"]), 'bbox': list(span['bbox']),
"content": __replace_STX_ETX(span["text"]), 'content': __replace_STX_ETX(span['text']),
"type": ContentType.Text, 'type': ContentType.Text,
"score": 1.0, 'score': 1.0,
} }
) )
return spans return spans
def replace_text_span(pymu_spans, ocr_spans): def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode): def parse_page_core(
pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
):
need_drop = False need_drop = False
drop_reason = [] drop_reason = []
'''从magic_model对象中获取后面会用到的区块信息''' """从magic_model对象中获取后面会用到的区块信息"""
img_blocks = magic_model.get_imgs(page_id) img_blocks = magic_model.get_imgs(page_id)
table_blocks = magic_model.get_tables(page_id) table_blocks = magic_model.get_tables(page_id)
discarded_blocks = magic_model.get_discarded(page_id) discarded_blocks = magic_model.get_discarded(page_id)
text_blocks = magic_model.get_text_blocks(page_id) text_blocks = magic_model.get_text_blocks(page_id)
title_blocks = magic_model.get_title_blocks(page_id) title_blocks = magic_model.get_title_blocks(page_id)
inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id) inline_equations, interline_equations, interline_equation_blocks = (
magic_model.get_equations(page_id)
)
page_w, page_h = magic_model.get_page_size(page_id) page_w, page_h = magic_model.get_page_size(page_id)
spans = magic_model.get_all_spans(page_id) spans = magic_model.get_all_spans(page_id)
'''根据parse_mode,构造spans''' """根据parse_mode,构造spans"""
if parse_mode == "txt": if parse_mode == 'txt':
"""ocr 中文本类的 span 用 pymu spans 替换!""" """ocr 中文本类的 span 用 pymu spans 替换!"""
pymu_spans = txt_spans_extract( pymu_spans = txt_spans_extract(
pdf_docs[page_id], inline_equations, interline_equations pdf_docs[page_id], inline_equations, interline_equations
) )
spans = replace_text_span(pymu_spans, spans) spans = replace_text_span(pymu_spans, spans)
elif parse_mode == "ocr": elif parse_mode == 'ocr':
pass pass
else: else:
raise Exception("parse_mode must be txt or ocr") raise Exception('parse_mode must be txt or ocr')
'''删除重叠spans中置信度较低的那些''' """删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
'''删除重叠spans中较小的那些''' """删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
'''对image和table截图''' """对image和table截图"""
spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter) spans = ocr_cut_image_and_table(
spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter
)
'''将所有区块的bbox整理到一起''' """将所有区块的bbox整理到一起"""
# interline_equation_blocks参数不够准,后面切换到interline_equations上 # interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = [] interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( all_bboxes, all_discarded_blocks, drop_reasons = (
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, ocr_prepare_bboxes_for_layout_split(
interline_equation_blocks, page_w, page_h) img_blocks,
table_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
)
)
else: else:
all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( all_bboxes, all_discarded_blocks, drop_reasons = (
img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, ocr_prepare_bboxes_for_layout_split(
interline_equations, page_w, page_h) img_blocks,
table_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equations,
page_w,
page_h,
)
)
if len(drop_reasons) > 0: if len(drop_reasons) > 0:
need_drop = True need_drop = True
drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION) drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
'''先处理不需要排版的discarded_blocks''' """先处理不需要排版的discarded_blocks"""
discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4) discarded_block_with_spans, spans = fill_spans_in_blocks(
all_discarded_blocks, spans, 0.4
)
fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans) fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
'''如果当前页面没有bbox则跳过''' """如果当前页面没有bbox则跳过"""
if len(all_bboxes) == 0: if len(all_bboxes) == 0:
logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}") logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], return ocr_construct_page_component_v2(
[], [], interline_equations, fix_discarded_blocks, [],
need_drop, drop_reason) [],
page_id,
page_w,
page_h,
[],
[],
[],
interline_equations,
fix_discarded_blocks,
need_drop,
drop_reason,
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况 while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes) is_useful_block_horz_overlap, all_bboxes = (
remove_horizontal_overlap_block_which_smaller(all_bboxes)
)
if is_useful_block_horz_overlap: if is_useful_block_horz_overlap:
need_drop = True need_drop = True
drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP) drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP)
else: else:
break break
'''根据区块信息计算layout''' """根据区块信息计算layout"""
page_boundry = [0, 0, page_w, page_h] page_boundry = [0, 0, page_w, page_h]
layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id) layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id)
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0: if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning( logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}") f'skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}'
)
need_drop = True need_drop = True
drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT) drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT)
"""以下去掉复杂的布局和超过2列的布局""" """以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局 if any(
[lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes]
): # 复杂的布局
logger.warning( logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}") f'skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}'
)
need_drop = True need_drop = True
drop_reason.append(DropReason.COMPLICATED_LAYOUT) drop_reason.append(DropReason.COMPLICATED_LAYOUT)
layout_column_width = get_columns_cnt_of_layout(layout_tree) layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning( logger.warning(
f"skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}") f'skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}'
)
need_drop = True need_drop = True
drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS) drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS)
'''根据layout顺序,对当前页面所有需要留下的block进行排序''' """根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
'''将span填入排好序的blocks中''' """将span填入排好序的blocks中"""
block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3) block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
'''对block进行fix操作''' """对block进行fix操作"""
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks) fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
'''获取QA需要外置的list''' """获取QA需要外置的list"""
images, tables, interline_equations = get_qa_need_list_v2(fix_blocks) images, tables, interline_equations = get_qa_need_list_v2(fix_blocks)
'''构造pdf_info_dict''' """构造pdf_info_dict"""
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = ocr_construct_page_component_v2(
images, tables, interline_equations, fix_discarded_blocks, fix_blocks,
need_drop, drop_reason) layout_bboxes,
page_id,
page_w,
page_h,
layout_tree,
images,
tables,
interline_equations,
fix_discarded_blocks,
need_drop,
drop_reason,
)
return page_info return page_info
def pdf_parse_union(pdf_bytes, def pdf_parse_union(
model_list, pdf_bytes,
imageWriter, model_list,
parse_mode, imageWriter,
start_page_id=0, parse_mode,
end_page_id=None, start_page_id=0,
debug_mode=False, end_page_id=None,
): debug_mode=False,
):
pdf_bytes_md5 = compute_md5(pdf_bytes) pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes) pdf_docs = fitz.open('pdf', pdf_bytes)
'''初始化空的pdf_info_dict''' """初始化空的pdf_info_dict"""
pdf_info_dict = {} pdf_info_dict = {}
'''用model_list和docs对象初始化magic_model''' """用model_list和docs对象初始化magic_model"""
magic_model = MagicModel(model_list, pdf_docs) magic_model = MagicModel(model_list, pdf_docs)
'''根据输入的起始范围解析pdf''' """根据输入的起始范围解析pdf"""
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1 end_page_id = (
end_page_id
if end_page_id is not None and end_page_id >= 0
else len(pdf_docs) - 1
)
if end_page_id > len(pdf_docs) - 1: if end_page_id > len(pdf_docs) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length") logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(pdf_docs) - 1 end_page_id = len(pdf_docs) - 1
'''初始化启动时间''' """初始化启动时间"""
start_time = time.time() start_time = time.time()
for page_id, page in enumerate(pdf_docs): for page_id, page in enumerate(pdf_docs):
'''debug时输出每页解析的耗时''' """debug时输出每页解析的耗时."""
if debug_mode: if debug_mode:
time_now = time.time() time_now = time.time()
logger.info( logger.info(
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
) )
start_time = time_now start_time = time_now
'''解析pdf中的每一页''' """解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id: if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode) page_info = parse_page_core(
pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
)
else: else:
page_w = page.rect.width page_w = page.rect.width
page_h = page.rect.height page_h = page.rect.height
page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], page_info = ocr_construct_page_component_v2(
[], [], [], [], [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
True, "skip page") )
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f'page_{page_id}'] = page_info
"""分段""" """分段"""
para_split(pdf_info_dict, debug_mode=debug_mode) para_split(pdf_info_dict, debug_mode=debug_mode)
...@@ -261,7 +335,7 @@ def pdf_parse_union(pdf_bytes, ...@@ -261,7 +335,7 @@ def pdf_parse_union(pdf_bytes,
"""dict转list""" """dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict) pdf_info_list = dict_to_list(pdf_info_dict)
new_pdf_info_dict = { new_pdf_info_dict = {
"pdf_info": pdf_info_list, 'pdf_info': pdf_info_list,
} }
return new_pdf_info_dict return new_pdf_info_dict
......
...@@ -7,17 +7,17 @@ from typing import List ...@@ -7,17 +7,17 @@ from typing import List
import torch import torch
from loguru import logger from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.clean_memory import clean_memory from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.local_math import float_equal from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
...@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import ( ...@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2 ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks, from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_discarded_block, fix_block_spans_v2,
fix_block_spans_v2) fix_discarded_block)
from magic_pdf.pre_proc.ocr_span_list_modify import ( from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans, get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans) remove_overlaps_min_spans)
...@@ -164,8 +164,8 @@ class ModelSingleton: ...@@ -164,8 +164,8 @@ class ModelSingleton:
def do_predict(boxes: List[List[int]], model) -> List[int]: def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits, from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
prepare_inputs) boxes2inputs, parse_logits, prepare_inputs)
inputs = boxes2inputs(boxes) inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model) inputs = prepare_inputs(inputs, model)
...@@ -206,7 +206,9 @@ def cal_block_index(fix_blocks, sorted_bboxes): ...@@ -206,7 +206,9 @@ def cal_block_index(fix_blocks, sorted_bboxes):
del block['real_lines'] del block['real_lines']
import numpy as np import numpy as np
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
recursive_xy_cut
random_boxes = np.array(block_bboxes) random_boxes = np.array(block_bboxes)
np.random.shuffle(random_boxes) np.random.shuffle(random_boxes)
...@@ -291,7 +293,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -291,7 +293,7 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list.append(bbox) page_line_list.append(bbox)
elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]: elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
bbox = block['bbox'] bbox = block['bbox']
block["real_lines"] = copy.deepcopy(block['lines']) block['real_lines'] = copy.deepcopy(block['lines'])
lines = insert_lines_into_block(bbox, line_height, page_w, page_h) lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = [] block['lines'] = []
for line in lines: for line in lines:
......
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.dict2md.ocr_mkcontent import union_make from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.filter.pdf_classify_by_type import classify from magic_pdf.filter.pdf_classify_by_type import classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
from magic_pdf.libs.drop_reason import DropReason
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
class AbsPipe(ABC): class AbsPipe(ABC):
......
from loguru import logger from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf from magic_pdf.user_api import parse_ocr_pdf
......
from loguru import logger from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_txt_pdf from magic_pdf.user_api import parse_txt_pdf
......
...@@ -2,9 +2,9 @@ import json ...@@ -2,9 +2,9 @@ import json
from loguru import logger from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
......
from loguru import logger from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
from magic_pdf.libs.drop_reason import DropReason
def __is_pseudo_single_column(page_info) -> bool: def __is_pseudo_single_column(page_info) -> bool:
""" """判断一个页面是否伪单列。
判断一个页面是否伪单列。
Args: Args:
page_info (dict): 页面信息字典,包括'_layout_tree'和'preproc_blocks'。 page_info (dict): 页面信息字典,包括'_layout_tree'和'preproc_blocks'。
Returns: Returns:
Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info),否则返回(False, None)。 Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info),否则返回(False, None)。
""" """
layout_tree = page_info['_layout_tree'] layout_tree = page_info['_layout_tree']
layout_column_width = get_columns_cnt_of_layout(layout_tree) layout_column_width = get_columns_cnt_of_layout(layout_tree)
...@@ -41,27 +39,22 @@ def __is_pseudo_single_column(page_info) -> bool: ...@@ -41,27 +39,22 @@ def __is_pseudo_single_column(page_info) -> bool:
if num_lines > 20: if num_lines > 20:
radio = num_satisfying_lines / num_lines radio = num_satisfying_lines / num_lines
if radio >= 0.5: if radio >= 0.5:
extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}" extra_info = f'{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}'
block_text = [] block_text = []
for line in lines: for line in lines:
if line['spans']: if line['spans']:
for span in line['spans']: for span in line['spans']:
block_text.append(span['text']) block_text.append(span['text'])
logger.warning(f"pseudo_single_column block_text: {block_text}") logger.warning(f'pseudo_single_column block_text: {block_text}')
return True, extra_info return True, extra_info
return False, None return False, None
def pdf_post_filter(page_info) -> tuple: def pdf_post_filter(page_info) -> tuple:
""" """return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
return:(True|False, err_msg)
True, 如果pdf符合要求
False, 如果pdf不符合要求
"""
bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info) bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
if bool_is_pseudo_single_column: if bool_is_pseudo_single_column:
return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info} return False, {'_need_drop': True, '_drop_reason': DropReason.PSEUDO_SINGLE_COLUMN, 'extra_info': extra_info}
return True, None return True, None
\ No newline at end of file
from loguru import logger from loguru import logger
from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image from magic_pdf.libs.pdf_image_tools import cut_image
...@@ -29,9 +29,7 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str, ...@@ -29,9 +29,7 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
image_bboxes: list, images_overlap_backup: list, table_bboxes: list, image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
equation_inline_bboxes: list, equation_inline_bboxes: list,
equation_interline_bboxes: list, imageWriter) -> dict: equation_interline_bboxes: list, imageWriter) -> dict:
""" """返回一个dict, key为bbox, 值是图片地址."""
返回一个dict, key为bbox, 值是图片地址
"""
image_info = [] image_info = []
image_backup_info = [] image_backup_info = []
table_info = [] table_info = []
...@@ -46,26 +44,26 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str, ...@@ -46,26 +44,26 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
for bbox in image_bboxes: for bbox in image_bboxes:
if not check_img_bbox(bbox): if not check_img_bbox(bbox):
continue continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
image_info.append({"bbox": bbox, "image_path": image_path}) image_info.append({'bbox': bbox, 'image_path': image_path})
for bbox in images_overlap_backup: for bbox in images_overlap_backup:
if not check_img_bbox(bbox): if not check_img_bbox(bbox):
continue continue
image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
image_backup_info.append({"bbox": bbox, "image_path": image_path}) image_backup_info.append({'bbox': bbox, 'image_path': image_path})
for bbox in table_bboxes: for bbox in table_bboxes:
if not check_img_bbox(bbox): if not check_img_bbox(bbox):
continue continue
image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter) image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
table_info.append({"bbox": bbox, "image_path": image_path}) table_info.append({'bbox': bbox, 'image_path': image_path})
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
def check_img_bbox(bbox) -> bool: def check_img_bbox(bbox) -> bool:
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]): if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
logger.warning(f"image_bboxes: 错误的box, {bbox}") logger.warning(f'image_bboxes: 错误的box, {bbox}')
return False return False
return True return True
This diff is collapsed.
from loguru import logger
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \ from magic_pdf.config.ocr_content_type import BlockType
calculate_iou, calculate_vertical_projection_overlap_ratio from magic_pdf.libs.boxbase import (
from magic_pdf.libs.drop_tag import DropTag calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
from magic_pdf.libs.ocr_content_type import BlockType calculate_vertical_projection_overlap_ratio,
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block get_minbox_if_overlap_by_ratio)
from magic_pdf.pre_proc.remove_bbox_overlap import \
remove_overlap_between_bbox_for_block
def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
title_blocks, interline_equation_blocks, page_w, page_h):
def ocr_prepare_bboxes_for_layout_split(
img_blocks,
table_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
):
all_bboxes = [] all_bboxes = []
all_discarded_blocks = [] all_discarded_blocks = []
for image in img_blocks: for image in img_blocks:
x0, y0, x1, y1 = image['bbox'] x0, y0, x1, y1 = image['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]]) all_bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Image,
None,
None,
None,
None,
image['score'],
]
)
for table in table_blocks: for table in table_blocks:
x0, y0, x1, y1 = table['bbox'] x0, y0, x1, y1 = table['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]]) all_bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Table,
None,
None,
None,
None,
table['score'],
]
)
for text in text_blocks: for text in text_blocks:
x0, y0, x1, y1 = text['bbox'] x0, y0, x1, y1 = text['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]]) all_bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Text,
None,
None,
None,
None,
text['score'],
]
)
for title in title_blocks: for title in title_blocks:
x0, y0, x1, y1 = title['bbox'] x0, y0, x1, y1 = title['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]]) all_bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Title,
None,
None,
None,
None,
title['score'],
]
)
for interline_equation in interline_equation_blocks: for interline_equation in interline_equation_blocks:
x0, y0, x1, y1 = interline_equation['bbox'] x0, y0, x1, y1 = interline_equation['bbox']
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]]) all_bboxes.append(
[
'''block嵌套问题解决''' x0,
'''文本框与标题框重叠,优先信任文本框''' y0,
x1,
y1,
None,
None,
None,
BlockType.InterlineEquation,
None,
None,
None,
None,
interline_equation['score'],
]
)
"""block嵌套问题解决"""
"""文本框与标题框重叠,优先信任文本框"""
all_bboxes = fix_text_overlap_title_blocks(all_bboxes) all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''任何框体与舍弃框重叠,优先信任舍弃框''' """任何框体与舍弃框重叠,优先信任舍弃框"""
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
# interline_equation 与title或text框冲突的情况,分两种情况处理 # interline_equation 与title或text框冲突的情况,分两种情况处理
'''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框''' """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes) all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
# 通过后续大框套小框逻辑删除 # 通过后续大框套小框逻辑删除
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' """discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
for discarded in discarded_blocks: for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox'] x0, y0, x1, y1 = discarded['bbox']
all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]]) all_discarded_blocks.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Discarded,
None,
None,
None,
None,
discarded['score'],
]
)
# 将footnote加入到all_bboxes中,用来计算layout # 将footnote加入到all_bboxes中,用来计算layout
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]]) all_bboxes.append(
[
'''经过以上处理后,还存在大框套小框的情况,则删除小框''' x0,
y0,
x1,
y1,
None,
None,
None,
BlockType.Footnote,
None,
None,
None,
None,
discarded['score'],
]
)
"""经过以上处理后,还存在大框套小框的情况,则删除小框"""
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错''' """将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks, drop_reasons return all_bboxes, all_discarded_blocks, drop_reasons
...@@ -64,18 +185,64 @@ def add_bboxes(blocks, block_type, bboxes): ...@@ -64,18 +185,64 @@ def add_bboxes(blocks, block_type, bboxes):
for block in blocks: for block in blocks:
x0, y0, x1, y1 = block['bbox'] x0, y0, x1, y1 = block['bbox']
if block_type in [ if block_type in [
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.ImageBody,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableBody,
BlockType.TableCaption,
BlockType.TableFootnote,
]: ]:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]]) bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
block_type,
None,
None,
None,
None,
block['score'],
block['group_id'],
]
)
else: else:
bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]]) bboxes.append(
[
x0,
y0,
x1,
y1,
None,
None,
None,
block_type,
None,
None,
None,
None,
block['score'],
]
)
def ocr_prepare_bboxes_for_layout_split_v2( def ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, img_caption_blocks,
discarded_blocks, text_blocks, title_blocks, interline_equation_blocks, page_w, page_h img_footnote_blocks,
table_body_blocks,
table_caption_blocks,
table_footnote_blocks,
discarded_blocks,
text_blocks,
title_blocks,
interline_equation_blocks,
page_w,
page_h,
): ):
all_bboxes = [] all_bboxes = []
...@@ -89,40 +256,40 @@ def ocr_prepare_bboxes_for_layout_split_v2( ...@@ -89,40 +256,40 @@ def ocr_prepare_bboxes_for_layout_split_v2(
add_bboxes(title_blocks, BlockType.Title, all_bboxes) add_bboxes(title_blocks, BlockType.Title, all_bboxes)
add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes) add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
'''block嵌套问题解决''' """block嵌套问题解决"""
'''文本框与标题框重叠,优先信任文本框''' """文本框与标题框重叠,优先信任文本框"""
all_bboxes = fix_text_overlap_title_blocks(all_bboxes) all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''任何框体与舍弃框重叠,优先信任舍弃框''' """任何框体与舍弃框重叠,优先信任舍弃框"""
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
# interline_equation 与title或text框冲突的情况,分两种情况处理 # interline_equation 与title或text框冲突的情况,分两种情况处理
'''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框''' """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes) all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
# 通过后续大框套小框逻辑删除 # 通过后续大框套小框逻辑删除
'''discarded_blocks''' """discarded_blocks"""
all_discarded_blocks = [] all_discarded_blocks = []
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks) add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
'''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的''' """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
footnote_blocks = [] footnote_blocks = []
for discarded in discarded_blocks: for discarded in discarded_blocks:
x0, y0, x1, y1 = discarded['bbox'] x0, y0, x1, y1 = discarded['bbox']
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
footnote_blocks.append([x0, y0, x1, y1]) footnote_blocks.append([x0, y0, x1, y1])
'''移除在footnote下面的任何框''' """移除在footnote下面的任何框"""
need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks) need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
if len(need_remove_blocks) > 0: if len(need_remove_blocks) > 0:
for block in need_remove_blocks: for block in need_remove_blocks:
all_bboxes.remove(block) all_bboxes.remove(block)
all_discarded_blocks.append(block) all_discarded_blocks.append(block)
'''经过以上处理后,还存在大框套小框的情况,则删除小框''' """经过以上处理后,还存在大框套小框的情况,则删除小框"""
all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错''' """将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks
...@@ -135,7 +302,13 @@ def find_blocks_under_footnote(all_bboxes, footnote_blocks): ...@@ -135,7 +302,13 @@ def find_blocks_under_footnote(all_bboxes, footnote_blocks):
for footnote_bbox in footnote_blocks: for footnote_bbox in footnote_blocks:
footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
# 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1 # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8: if (
block_y0 >= footnote_y1
and calculate_vertical_projection_overlap_ratio(
(block_x0, block_y0, block_x1, block_y1), footnote_bbox
)
>= 0.8
):
if block not in need_remove_blocks: if block not in need_remove_blocks:
need_remove_blocks.append(block) need_remove_blocks.append(block)
break break
...@@ -203,7 +376,12 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks): ...@@ -203,7 +376,12 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
for block in all_bboxes: for block in all_bboxes:
for discarded_block in discarded_blocks: for discarded_block in discarded_blocks:
block_bbox = block[:4] block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6: if (
calculate_overlap_area_in_bbox1_area_ratio(
block_bbox, discarded_block['bbox']
)
> 0.6
):
if block not in need_remove: if block not in need_remove:
need_remove.append(block) need_remove.append(block)
break break
...@@ -223,10 +401,18 @@ def remove_overlaps_min_blocks(all_bboxes): ...@@ -223,10 +401,18 @@ def remove_overlaps_min_blocks(all_bboxes):
if block1 != block2: if block1 != block2:
block1_bbox = block1[:4] block1_bbox = block1[:4]
block2_bbox = block2[:4] block2_bbox = block2[:4]
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8) overlap_box = get_minbox_if_overlap_by_ratio(
block1_bbox, block2_bbox, 0.8
)
if overlap_box is not None: if overlap_box is not None:
block_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None) block_to_remove = next(
if block_to_remove is not None and block_to_remove not in need_remove: (block for block in all_bboxes if block[:4] == overlap_box),
None,
)
if (
block_to_remove is not None
and block_to_remove not in need_remove
):
large_block = block1 if block1 != block_to_remove else block2 large_block = block1 if block1 != block_to_remove else block2
x1, y1, x2, y2 = large_block[:4] x1, y1, x2, y2 = large_block[:4]
sx1, sy1, sx2, sy2 = block_to_remove[:4] sx1, sy1, sx2, sy2 = block_to_remove[:4]
......
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold, from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio, _is_in_or_part_overlap_with_area_ratio,
calculate_overlap_area_in_bbox1_area_ratio) calculate_overlap_area_in_bbox1_area_ratio)
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
...@@ -157,7 +157,7 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -157,7 +157,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
]: ]:
block_dict["group_id"] = block[-1] block_dict['group_id'] = block[-1]
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import re import re
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
from magic_pdf.libs.boxbase import _is_in_or_part_overlap from magic_pdf.libs.boxbase import _is_in_or_part_overlap
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
page_no_bboxs, page_w, page_h): page_no_bboxs, page_w, page_h):
""" """删除页眉页脚,页码 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中."""
删除页眉页脚,页码
从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
"""
header = [] header = []
footer = [] footer = []
if len(header) == 0: if len(header) == 0:
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment