Unverified commit 2de5a79f, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #2251 from myhloli/dev

feat(pdf_parse): add footnote block handling in layout split
parents cfa90743 058d3184
...@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
return [[x0, y0, x1, y1]] return [[x0, y0, x1, y1]]
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
page_line_list = [] page_line_list = []
def add_lines_to_block(b): def add_lines_to_block(b):
...@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block['real_lines'] = copy.deepcopy(block['lines']) block['real_lines'] = copy.deepcopy(block['lines'])
add_lines_to_block(block) add_lines_to_block(block)
for block in footnote_blocks:
footnote_block = {'bbox': block[:4]}
add_lines_to_block(footnote_block)
if len(page_line_list) > 200: # layoutreader最高支持512line if len(page_line_list) > 200: # layoutreader最高支持512line
return None return None
...@@ -779,7 +783,7 @@ def parse_page_core( ...@@ -779,7 +783,7 @@ def parse_page_core(
# interline_equation_blocks参数不够准,后面切换到interline_equations上 # interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks = [] interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
...@@ -790,7 +794,7 @@ def parse_page_core( ...@@ -790,7 +794,7 @@ def parse_page_core(
page_h, page_h,
) )
else: else:
all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2( all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
discarded_blocks, discarded_blocks,
...@@ -866,7 +870,7 @@ def parse_page_core( ...@@ -866,7 +870,7 @@ def parse_page_core(
line_height = get_line_height(fix_blocks) line_height = get_line_height(fix_blocks)
"""获取所有line并对line排序""" """获取所有line并对line排序"""
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height) sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)
"""根据line的中位数算block的序列关系""" """根据line的中位数算block的序列关系"""
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes) fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
......
...@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2( ...@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
"""将剩余的bbox做分离处理,防止后面分layout时出错""" """将剩余的bbox做分离处理,防止后面分layout时出错"""
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes.sort(key=lambda x: x[0]+x[1]) all_bboxes.sort(key=lambda x: x[0]+x[1])
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks, footnote_blocks
def find_blocks_under_footnote(all_bboxes, footnote_blocks): def find_blocks_under_footnote(all_bboxes, footnote_blocks):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment