Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
4f6d8d7c
Commit
4f6d8d7c
authored
Jun 12, 2025
by
myhloli
Browse files
fix: improve LaTeX delimiter handling by replacing valid and invalid pairs
parent
84fa04e2
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
23 deletions
+30
-23
mineru/backend/vlm/vlm_magic_model.py
mineru/backend/vlm/vlm_magic_model.py
+30
-23
No files found.
mineru/backend/vlm/vlm_magic_model.py
View file @
4f6d8d7c
...
@@ -205,35 +205,42 @@ def isolated_formula_clean(txt):
...
@@ -205,35 +205,42 @@ def isolated_formula_clean(txt):
def
latex_fix
(
latex
):
def
latex_fix
(
latex
):
# 白名单分隔符
# valid pairs:
valid_delims_list
=
[
r
'('
,
r
')'
,
r
'['
,
r
']'
,
r
'{'
,
r
'}'
,
r
'/'
,
r
'|'
,
# \left\{ ... \right\}
r
'\{'
,
r
'\}'
,
r
'\lceil'
,
r
'\rceil'
,
r
'\lfloor'
,
# \left( ... \right)
r
'\rfloor'
,
r
'\backslash'
,
r
'\uparrow'
,
r
'\downarrow'
,
# \left| ... \right|
r
'\Uparrow'
,
r
'\Downarrow'
,
r
'\|'
,
r
'\.'
]
# \left\| ... \right\|
# \left[ ... \right]
# 为\left后缺失有效分隔符的情况添加点
def
fix_delim
(
match
):
cmd
=
match
.
group
(
1
)
# \left 或 \right
rest
=
match
.
group
(
2
)
if
len
(
match
.
groups
())
>
1
else
""
if
not
rest
or
rest
not
in
valid_delims_list
:
return
cmd
+
"."
return
match
.
group
(
0
)
LEFT_PATTERN
=
re
.
compile
(
r
'(\\left)(\S*)'
)
RIGHT_PATTERN
=
re
.
compile
(
r
'(\\right)(\S*)'
)
LEFT_COUNT_PATTERN
=
re
.
compile
(
r
'\\left(?![a-zA-Z])'
)
LEFT_COUNT_PATTERN
=
re
.
compile
(
r
'\\left(?![a-zA-Z])'
)
RIGHT_COUNT_PATTERN
=
re
.
compile
(
r
'\\right(?![a-zA-Z])'
)
RIGHT_COUNT_PATTERN
=
re
.
compile
(
r
'\\right(?![a-zA-Z])'
)
LEFT_RIGHT_REMOVE_PATTERN
=
re
.
compile
(
r
'\\left\.?|\\right\.?'
)
latex
=
LEFT_PATTERN
.
sub
(
lambda
m
:
fix_delim
(
m
),
latex
)
latex
=
RIGHT_PATTERN
.
sub
(
lambda
m
:
fix_delim
(
m
),
latex
)
left_count
=
len
(
LEFT_COUNT_PATTERN
.
findall
(
latex
))
# 不匹配\lefteqn等
left_count
=
len
(
LEFT_COUNT_PATTERN
.
findall
(
latex
))
# 不匹配\lefteqn等
right_count
=
len
(
RIGHT_COUNT_PATTERN
.
findall
(
latex
))
# 不匹配\rightarrow
right_count
=
len
(
RIGHT_COUNT_PATTERN
.
findall
(
latex
))
# 不匹配\rightarrow
if
left_count
!=
right_count
:
if
left_count
!=
right_count
:
return
LEFT_RIGHT_REMOVE_PATTERN
.
sub
(
''
,
latex
)
for
_
in
range
(
2
):
# replace valid pairs
latex
=
re
.
sub
(
r
'\\left\\\{'
,
"{"
,
latex
)
# \left\{
latex
=
re
.
sub
(
r
"\\left\|"
,
"|"
,
latex
)
# \left|
latex
=
re
.
sub
(
r
"\\left\\\|"
,
"|"
,
latex
)
# \left\|
latex
=
re
.
sub
(
r
"\\left\("
,
"("
,
latex
)
# \left(
latex
=
re
.
sub
(
r
"\\left\["
,
"["
,
latex
)
# \left[
latex
=
re
.
sub
(
r
"\\right\\\}"
,
"}"
,
latex
)
# \right\}
latex
=
re
.
sub
(
r
"\\right\|"
,
"|"
,
latex
)
# \right|
latex
=
re
.
sub
(
r
"\\right\\\|"
,
"|"
,
latex
)
# \right\|
latex
=
re
.
sub
(
r
"\\right\)"
,
")"
,
latex
)
# \right)
latex
=
re
.
sub
(
r
"\\right\]"
,
"]"
,
latex
)
# \right]
latex
=
re
.
sub
(
r
"\\right\."
,
""
,
latex
)
# \right.
# replace invalid pairs first
latex
=
re
.
sub
(
r
'\\left\{'
,
"{"
,
latex
)
latex
=
re
.
sub
(
r
'\\right\}'
,
"}"
,
latex
)
# \left{ ... \right}
latex
=
re
.
sub
(
r
'\\left\\\('
,
"("
,
latex
)
latex
=
re
.
sub
(
r
'\\right\\\)'
,
")"
,
latex
)
# \left\( ... \right\)
latex
=
re
.
sub
(
r
'\\left\\\['
,
"["
,
latex
)
latex
=
re
.
sub
(
r
'\\right\\\]'
,
"]"
,
latex
)
# \left\[ ... \right\]
return
latex
return
latex
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment