Commit 18691cfd authored by myhloli
Browse files

refactor: enhance span merging logic for vertical text blocks in span_block_fix.py and ocr_utils.py

parent a2f0099c
...@@ -45,7 +45,21 @@ def __is_overlaps_y_exceeds_threshold(bbox1, ...@@ -45,7 +45,21 @@ def __is_overlaps_y_exceeds_threshold(bbox1,
# max_height = max(height1, height2) # max_height = max(height1, height2)
min_height = min(height1, height2) min_height = min(height1, height2)
return (overlap / min_height) > overlap_ratio_threshold return (overlap / min_height) > overlap_ratio_threshold if min_height > 0 else False
def __is_overlaps_x_exceeds_threshold(bbox1,
                                      bbox2,
                                      overlap_ratio_threshold=0.8):
    """Tell whether two bboxes overlap sufficiently along the x axis.

    The overlapping width is divided by the width of the narrower bbox and
    the resulting ratio is compared with ``overlap_ratio_threshold``.

    Args:
        bbox1: Four-element box; the first and third items are read as x0, x1.
        bbox2: Four-element box with the same layout as ``bbox1``.
        overlap_ratio_threshold: Ratio that must be strictly exceeded.

    Returns:
        True when the ratio is strictly greater than the threshold. False
        otherwise, including when the narrower bbox has no positive width.
    """
    left_a, _, right_a, _ = bbox1
    left_b, _, right_b, _ = bbox2

    # Width of the shared x interval, clamped to zero for disjoint boxes.
    shared_right = min(right_a, right_b)
    shared_left = max(left_a, left_b)
    shared_width = max(0, shared_right - shared_left)

    narrower_width = min(right_a - left_a, right_b - left_b)
    # A degenerate (zero or negative width) box can never satisfy the ratio
    # test; bail out before dividing.
    if not narrower_width > 0:
        return False
    return (shared_width / narrower_width) > overlap_ratio_threshold
def img_decode(content: bytes): def img_decode(content: bytes):
......
# Copyright (c) Opendatalab. All rights reserved. # Copyright (c) Opendatalab. All rights reserved.
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.enum_class import BlockType, ContentType from mineru.utils.enum_class import BlockType, ContentType
from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold from mineru.utils.ocr_utils import __is_overlaps_y_exceeds_threshold, __is_overlaps_x_exceeds_threshold
def fill_spans_in_blocks(blocks, spans, radio): def fill_spans_in_blocks(blocks, spans, radio):
...@@ -71,8 +71,26 @@ def fix_text_block(block): ...@@ -71,8 +71,26 @@ def fix_text_block(block):
for span in block['spans']: for span in block['spans']:
if span['type'] == ContentType.INTERLINE_EQUATION: if span['type'] == ContentType.INTERLINE_EQUATION:
span['type'] = ContentType.INLINE_EQUATION span['type'] = ContentType.INLINE_EQUATION
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines) # 假设block中的span超过80%的数量高度是宽度的两倍以上,则认为是纵向文本块
vertical_span_count = sum(
1 for span in block['spans']
if (span['bbox'][3] - span['bbox'][1]) / (span['bbox'][2] - span['bbox'][0]) > 2
)
total_span_count = len(block['spans'])
if total_span_count == 0:
vertical_ratio = 0
else:
vertical_ratio = vertical_span_count / total_span_count
if vertical_ratio > 0.8:
# 如果是纵向文本块,则按纵向lines处理
block_lines = merge_spans_to_vertical_line(block['spans'])
sort_block_lines = vertical_line_sort_spans_from_top_to_bottom(block_lines)
else:
block_lines = merge_spans_to_line(block['spans'])
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block['lines'] = sort_block_lines block['lines'] = sort_block_lines
del block['spans'] del block['spans']
return block return block
...@@ -117,6 +135,44 @@ def merge_spans_to_line(spans, threshold=0.6): ...@@ -117,6 +135,44 @@ def merge_spans_to_line(spans, threshold=0.6):
return lines return lines
def merge_spans_to_vertical_line(spans, threshold=0.6):
    """Group the spans of a vertical text block into columns, right to left.

    ``spans`` is sorted in place by the right edge of each bbox (``bbox[2]``)
    in descending order. The sorted spans are then walked once, and each span
    either joins the column being built or starts a new one.

    Args:
        spans: List of span dicts; each is read through ``'bbox'`` and ``'type'``.
        threshold: Overlap ratio passed to the x-axis overlap check.

    Returns:
        A list of columns, each a non-empty list of spans. Returns an empty
        list when ``spans`` is empty.
    """
    if len(spans) == 0:
        return []

    def stands_alone(item):
        # Interline equations, images and tables always get a column of their own.
        return item['type'] in (
            ContentType.INTERLINE_EQUATION,
            ContentType.IMAGE,
            ContentType.TABLE,
        )

    # Order by the x1 coordinate, largest first (right to left). Sorts in place.
    spans.sort(key=lambda item: item['bbox'][2], reverse=True)

    columns = []
    column = [spans[0]]
    for candidate in spans[1:]:
        # A span joins the current column only if neither it nor any span
        # already in the column is a stand-alone type, and it overlaps the
        # column's most recent span along the x axis.
        joins_column = (
            not stands_alone(candidate)
            and not any(stands_alone(member) for member in column)
            and __is_overlaps_x_exceeds_threshold(
                candidate['bbox'], column[-1]['bbox'], threshold)
        )
        if joins_column:
            column.append(candidate)
        else:
            columns.append(column)
            column = [candidate]

    # The column still being built is never empty, so it is always emitted.
    columns.append(column)
    return columns
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines): def line_sort_spans_by_left_to_right(lines):
line_objects = [] line_objects = []
...@@ -136,6 +192,28 @@ def line_sort_spans_by_left_to_right(lines): ...@@ -136,6 +192,28 @@ def line_sort_spans_by_left_to_right(lines):
return line_objects return line_objects
def vertical_line_sort_spans_from_top_to_bottom(vertical_lines):
    """Sort every column top to bottom and wrap it with its bounding box.

    Each inner list is sorted in place by ``bbox[1]`` (y0). The same list
    object is then stored under ``'spans'`` in the result.

    Args:
        vertical_lines: List of columns; each column is a list of span dicts
            with a ``'bbox'`` entry.

    Returns:
        A list of ``{'bbox': [x0, y0, x1, y1], 'spans': column}`` dicts, one
        per input column, where ``bbox`` encloses all spans of the column.
    """
    wrapped_columns = []
    for column in vertical_lines:
        # Reading order inside a column: ascending y0.
        column.sort(key=lambda item: item['bbox'][1])

        boxes = [item['bbox'] for item in column]
        # Union of all span boxes in this column.
        enclosing_box = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        wrapped_columns.append({'bbox': enclosing_box, 'spans': column})
    return wrapped_columns
def fix_block_spans(block_with_spans): def fix_block_spans(block_with_spans):
fix_blocks = [] fix_blocks = []
for block in block_with_spans: for block in block_with_spans:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment