"...v2/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "bf4914632176f868bde1553a6b58782eb5b75196"
Commit deb98fd0 authored by 赵小蒙
Browse files

fix footnote overlap error

parent 288bb074
...@@ -29,10 +29,10 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes): ...@@ -29,10 +29,10 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks.append({ useful_blocks.append({
"bbox": bbox[:4] "bbox": bbox[:4]
}) })
is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks) is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap: if is_useful_block_horz_overlap:
logger.warning( logger.warning(
f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}") f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
for bbox in all_bboxes.copy(): for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]: if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox) all_bboxes.remove(bbox)
......
...@@ -34,10 +34,6 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -34,10 +34,6 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
all_bboxes = fix_text_overlap_title_blocks(all_bboxes) all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
'''任何框体与舍弃框重叠,优先信任舍弃框''' '''任何框体与舍弃框重叠,优先信任舍弃框'''
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for discarded in discarded_blocks: for discarded in discarded_blocks:
...@@ -47,6 +43,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc ...@@ -47,6 +43,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None]) all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None])
'''经过以上处理后,还存在大框套小框的情况,则删除小框'''
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
'''将剩余的bbox做分离处理,防止后面分layout时出错'''
all_bboxes = remove_overlap_between_bbox_for_block(all_bboxes)
return all_bboxes, all_discarded_blocks return all_bboxes, all_discarded_blocks
......
...@@ -184,8 +184,8 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool: ...@@ -184,8 +184,8 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1]) area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]): if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
if area_i > area_j: if area_i > area_j:
return True, useful_bboxes[j] return True, useful_bboxes[j], useful_bboxes[i]
else: else:
return True, useful_bboxes[i] return True, useful_bboxes[i], useful_bboxes[j]
return False, None return False, None, None
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment