Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f2115541
Commit
f2115541
authored
Jun 04, 2025
by
myhloli
Browse files
refactor: improve text processing by adding ligature and unicode replacement functions
parent
76e1a7c1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
17 additions
and
2 deletions
+17
-2
mineru/backend/pipeline/batch_analyze.py
mineru/backend/pipeline/batch_analyze.py
+0
-1
mineru/utils/span_block_fix.py
mineru/utils/span_block_fix.py
+1
-1
mineru/utils/span_pre_proc.py
mineru/utils/span_pre_proc.py
+16
-0
No files found.
mineru/backend/pipeline/batch_analyze.py
View file @
f2115541
...
...
@@ -132,7 +132,6 @@ class BatchAnalyze:
# 获取OCR模型
ocr_model
=
atom_model_manager
.
get_atom_model
(
atom_model_name
=
'ocr'
,
ocr_show_log
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
lang
)
...
...
mineru/utils/span_block_fix.py
View file @
f2115541
...
...
@@ -38,7 +38,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
def
span_block_type_compatible
(
span_type
,
block_type
):
if
span_type
in
[
ContentType
.
TEXT
,
ContentType
.
IN
TER
LINE_EQUATION
]:
if
span_type
in
[
ContentType
.
TEXT
,
ContentType
.
INLINE_EQUATION
]:
return
block_type
in
[
BlockType
.
TEXT
,
BlockType
.
TITLE
,
...
...
mineru/utils/span_pre_proc.py
View file @
f2115541
# Copyright (c) Opendatalab. All rights reserved.
import
re
import
cv2
import
numpy
as
np
...
...
@@ -100,6 +101,19 @@ def remove_overlaps_min_spans(spans):
return
spans
,
dropped_spans
def __replace_ligatures(text: str):
    """Expand Latin typographic ligature characters in *text* to plain letters.

    PDF text layers frequently store glyphs such as the single code point
    U+FB01 instead of the two letters "fi". This helper rewrites every such
    code point to its ASCII spelling so downstream text matches what a reader
    would type.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every ligature listed below replaced; all other
        characters are left untouched.
    """
    # The keys are spelled as \u escapes on purpose: a literal ligature in the
    # source is easily turned into its plain letters by an editor, copy/paste
    # or Unicode normalisation, which leaves a table that maps 'fi' -> 'fi'
    # and never matches a real ligature.
    ligatures = {
        '\ufb00': 'ff',    # LATIN SMALL LIGATURE FF
        '\ufb01': 'fi',    # LATIN SMALL LIGATURE FI
        '\ufb02': 'fl',    # LATIN SMALL LIGATURE FL
        '\ufb03': 'ffi',   # LATIN SMALL LIGATURE FFI
        '\ufb04': 'ffl',   # LATIN SMALL LIGATURE FFL
        # NOTE(review): U+FB05 is "long s + t"; it is mapped to 'ft' to match
        # the existing table -- confirm whether 'st' was intended.
        '\ufb05': 'ft',    # LATIN SMALL LIGATURE LONG S T
        '\u017ft': 'ft',   # decomposed form (long s followed by t), kept as before
        '\ufb06': 'st',    # LATIN SMALL LIGATURE ST
    }
    pattern = '|'.join(map(re.escape, ligatures))
    return re.sub(pattern, lambda m: ligatures[m.group()], text)
def __replace_unicode(text: str):
    """Normalise two control sequences found in extracted text.

    * A carriage-return + line-feed pair is removed entirely.
    * U+0002 (START OF TEXT) is rewritten as an ASCII hyphen.
      NOTE(review): presumably the PDF text extractor uses U+0002 as a
      hyphenation marker -- confirm against the extractor's documentation.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with the substitutions above applied in one
        left-to-right pass; lone ``\\r`` or ``\\n`` characters are kept.
    """
    replacements = {
        '\r\n': '',
        '\u0002': '-',
    }

    def substitute(match):
        # Look up the replacement for whichever alternative matched.
        return replacements[match.group()]

    pattern = '|'.join(re.escape(token) for token in replacements)
    return re.sub(pattern, substitute, text)
def
txt_spans_extract
(
pdf_page
,
spans
,
pil_img
,
scale
):
textpage
=
pdf_page
.
get_textpage
()
...
...
@@ -117,6 +131,8 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale):
text
=
textpage
.
get_text_bounded
(
left
=
rect_box
[
0
],
top
=
rect_box
[
1
],
right
=
rect_box
[
2
],
bottom
=
rect_box
[
3
])
if
text
and
len
(
text
)
>
0
:
text
=
__replace_unicode
(
text
)
text
=
__replace_ligatures
(
text
)
span
[
'content'
]
=
text
.
strip
()
span
[
'score'
]
=
1.0
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment