"git@developer.sourcefind.cn:OpenDAS/torch-scatter.git" did not exist on "5909accb96874b0eb21dbc952e98e0191467d7b9"
Commit f674b8d4 authored by myhloli
Browse files

refactor(para): improve language detection and block splitting

- Add language detection for each block of text
- Implement language-specific logic for right margin alignment
- Introduce logging for debugging purposes
parent 08392d63
import copy import copy
from loguru import logger
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.language import detect_lang
LINE_STOP_FLAG = ( LINE_STOP_FLAG = (
'.', '.',
...@@ -125,6 +128,9 @@ def __is_list_or_index_block(block): ...@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
# 添加所有文本,包括空行,保持与block['lines']长度一致 # 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list.append(line_text) lines_text_list.append(line_text)
block_text = ''.join(lines_text_list)
block_lang = detect_lang(block_text)
# logger.info(f"block_lang: {block_lang}")
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
...@@ -136,13 +142,16 @@ def __is_list_or_index_block(block): ...@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height: if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
right_close_num += 1 right_close_num += 1
else: else:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 # 类中文没有超长单词的情况,可以用统一的阈值
# block宽的阈值可以小些,block窄的阈值要大 if block_lang in ['zh', 'ja', 'ko']:
if block_weight_radio >= 0.5:
closed_area = 0.26 * block_weight closed_area = 0.26 * block_weight
else: else:
closed_area = 0.36 * block_weight # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
# block宽的阈值可以小些,block窄的阈值要大
if block_weight_radio >= 0.5:
closed_area = 0.26 * block_weight
else:
closed_area = 0.36 * block_weight
if block['bbox_fs'][2] - line['bbox'][2] > closed_area: if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1 right_not_close_num += 1
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment