Commit bfb80cb2 authored by myhloli
Browse files

fix(mfr): improve LaTeX whitespace handling in unimernet model

- Preserve "\ " sequences during whitespace removal
- Add temporary substitution to prevent incorrect processing of "\ " sequences
- Restore "\ " sequences after removing unnecessary whitespace
parent 80a80482
...@@ -60,6 +60,10 @@ class TokenizerWrapper: ...@@ -60,6 +60,10 @@ class TokenizerWrapper:
def latex_rm_whitespace(s: str): def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code. """Remove unnecessary whitespace from LaTeX code.
""" """
# 先保存 "\ " 序列,防止被错误处理
s = re.sub(r'\\ ', r'\\SPACE', s)
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})' text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
letter = r'[a-zA-Z]' letter = r'[a-zA-Z]'
noletter = r'[\W_^\d]' noletter = r'[\W_^\d]'
...@@ -73,7 +77,11 @@ def latex_rm_whitespace(s: str): ...@@ -73,7 +77,11 @@ def latex_rm_whitespace(s: str):
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
if news == s: if news == s:
break break
return s
# 恢复 "\ " 序列
news = re.sub(r'\\SPACE', r'\\ ', news)
return news
class UnimernetModel(VisionEncoderDecoderModel): class UnimernetModel(VisionEncoderDecoderModel):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment