Commit c8cabb3c authored by myhloli's avatar myhloli
Browse files

feat(ocr_mkcontent): add language detection for line spacing

- Introduce language detection to determine line spacing based on language context
- Implement different spacing rules for Chinese/Japanese/Korean and Western texts
- Adjust span content handling based on detected language and span type
parent 7f2f2c0f
......@@ -5,6 +5,7 @@ from loguru import logger
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.para.para_split_v3 import ListLineTag
......@@ -142,11 +143,14 @@ def merge_para_with_text(para_block):
para_text += ' \n'
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for j, span in enumerate(line['spans']):
span_type = span['type']
......@@ -159,15 +163,20 @@ def merge_para_with_text(para_block):
content = f"\n$$\n{span['content']}\n$$\n"
content = content.strip()
if content != '':
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
para_text += content[:-1]
else: # content间需要空格分隔
para_text += f'{content} '
elif span_type == ContentType.InterlineEquation:
para_text += content
if content:
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
para_text += content if j == len(line['spans']) - 1 else f'{content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
para_text += content[:-1]
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} '
elif span_type == ContentType.InterlineEquation:
para_text += content
else:
continue
# 连写字符拆分
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment