Commit b80befe9 authored by myhloli's avatar myhloli
Browse files

refactor(mkcontent): optimize paragraph text merging and language detection

- Extract language detection to block level instead of line level
- Improve logic for handling Chinese, Japanese, and Korean languages
- Refactor code for better readability and performance
- Optimize handling of hyphenated words at line ends
parent ea35fa6b
......@@ -136,21 +136,19 @@ def __replace_ligatures(text: str):
def merge_para_with_text(para_block):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.Text]:
block_text += span['content']
block_lang = detect_lang(block_text)
para_text = ''
for i, line in enumerate(para_block['lines']):
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
para_text += ' \n'
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != '':
line_lang = detect_lang(line_text)
for j, span in enumerate(line['spans']):
span_type = span['type']
......@@ -166,12 +164,16 @@ def merge_para_with_text(para_block):
if content:
langs = ['zh', 'ja', 'ko']
if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
para_text += content if j == len(line['spans']) - 1 else f'{content} '
# logger.info(f'block_lang: {block_lang}, content: {content}')
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
if j == len(line['spans']) - 1:
para_text += content
else:
para_text += f'{content} '
else:
if span_type in [ContentType.Text, ContentType.InlineEquation]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
para_text += content[:-1]
else: # 西方文本语境下 content间需要空格分隔
para_text += f'{content} '
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment