Commit bd755962 authored by myhloli's avatar myhloli
Browse files

fix(merge_text): add ligature replacement functionality

- Implement __replace_ligatures function to split ligature characters
- Integrate ligature replacement into the merge_para_with_text function
- Handle common ligatures such as fi, fl, ff, ffi, and ffl
parent 24fb7041
...@@ -119,6 +119,16 @@ def detect_language(text): ...@@ -119,6 +119,16 @@ def detect_language(text):
return 'empty' return 'empty'
# Split typographic ligatures into their component letters.
# The code points are written as escapes so the table survives any tooling
# that normalises the source text (which would otherwise turn each entry
# into a no-op ASCII -> ASCII mapping).
_LIGATURE_TABLE = str.maketrans({
    '\ufb00': 'ff',   # LATIN SMALL LIGATURE FF
    '\ufb01': 'fi',   # LATIN SMALL LIGATURE FI
    '\ufb02': 'fl',   # LATIN SMALL LIGATURE FL
    '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
    '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
})


def __replace_ligatures(text: str) -> str:
    """Return *text* with common Latin ligatures expanded to plain letters.

    Handles the single-code-point ligatures ff, fi, fl, ffi and ffl
    (U+FB00 to U+FB04). All other characters are returned unchanged.

    Args:
        text: The string to clean up.

    Returns:
        A new string in which each supported ligature is replaced by its
        ASCII letter sequence.
    """
    # Each ligature is one code point, so a single translate() pass is
    # enough and replacement order does not matter.
    return text.translate(_LIGATURE_TABLE)
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
para_text = '' para_text = ''
for i, line in enumerate(para_block['lines']): for i, line in enumerate(para_block['lines']):
...@@ -166,6 +176,8 @@ def merge_para_with_text(para_block): ...@@ -166,6 +176,8 @@ def merge_para_with_text(para_block):
para_text += content para_text += content
else: else:
continue continue
# 连写字符拆分
para_text = __replace_ligatures(para_text)
return para_text return para_text
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment