"llm/llama.cpp/vscode:/vscode.git/clone" did not exist on "f3648fd20607f92f5fcf65ef7d7b52d919ae4909"
Commit 160624bd authored by myhloli
Browse files

refactor(para): improve block merging logic in para_split_v3.py

- Add checks for uppercase character start in the first span of a block
parent 14656085
......@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
first_span = first_line['spans'][0]
if len(first_span['content']) > 0:
span_start_with_num = first_span['content'][0].isdigit()
span_start_with_big_char = first_span['content'][0].isupper()
if (
abs(block2['bbox_fs'][2] - last_line['bbox'][2])
< line_height
# 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
# 上一个block的最后一个span不是以特定符号结尾
and not last_span['content'].endswith(LINE_STOP_FLAG)
# 两个block宽度差距超过2倍也不合并
and abs(block1_weight - block2_weight) < min_block_weight
# 下一个block的第一个字符是数字
and not span_start_with_num
# 下一个block的第一个字符是大写字母
and not span_start_with_big_char
):
if block1['page_num'] != block2['page_num']:
for line in block1['lines']:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment