Commit 1c491e7d authored by myhloli
Browse files

refactor: rename vertical text thresholds for clarity in span_block_fix.py

parent e56eec0d
...@@ -73,10 +73,11 @@ def fix_text_block(block): ...@@ -73,10 +73,11 @@ def fix_text_block(block):
span['type'] = ContentType.INLINE_EQUATION span['type'] = ContentType.INLINE_EQUATION
# 假设block中的span超过80%的数量高度是宽度的两倍以上,则认为是纵向文本块 # 假设block中的span超过80%的数量高度是宽度的两倍以上,则认为是纵向文本块
VERTICAL_TEXT_RATIO_THRESHOLD = 2 # Threshold for determining vertical text blocks VERTICAL_SPAN_HEIGHT_WIDTH_THRESHOLD = 2 # Threshold for determining vertical text blocks
VERTICAL_SPAN_IN_BLOCK_THRESHOLD = 0.8 # Threshold for determining vertical text blocks
vertical_span_count = sum( vertical_span_count = sum(
1 for span in block['spans'] 1 for span in block['spans']
if (span['bbox'][3] - span['bbox'][1]) / (span['bbox'][2] - span['bbox'][0]) > VERTICAL_TEXT_RATIO_THRESHOLD if (span['bbox'][3] - span['bbox'][1]) / (span['bbox'][2] - span['bbox'][0]) > VERTICAL_SPAN_HEIGHT_WIDTH_THRESHOLD
) )
total_span_count = len(block['spans']) total_span_count = len(block['spans'])
if total_span_count == 0: if total_span_count == 0:
...@@ -84,7 +85,7 @@ def fix_text_block(block): ...@@ -84,7 +85,7 @@ def fix_text_block(block):
else: else:
vertical_ratio = vertical_span_count / total_span_count vertical_ratio = vertical_span_count / total_span_count
if vertical_ratio > VERTICAL_TEXT_BLOCK_THRESHOLD: if vertical_ratio > VERTICAL_SPAN_IN_BLOCK_THRESHOLD:
# 如果是纵向文本块,则按纵向lines处理 # 如果是纵向文本块,则按纵向lines处理
block_lines = merge_spans_to_vertical_line(block['spans']) block_lines = merge_spans_to_vertical_line(block['spans'])
sort_block_lines = vertical_line_sort_spans_from_top_to_bottom(block_lines) sort_block_lines = vertical_line_sort_spans_from_top_to_bottom(block_lines)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment