Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1ee15504
"model/models/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "903b1fc97f37fda25fd233ed853355acfc0f63cf"
Commit
1ee15504
authored
Jul 04, 2025
by
Jmper
Browse files
refactor: Optimize fill_char_in_spans using a spatial grid
parent
592b659e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
17 additions
and
4 deletions
+17
-4
mineru/utils/span_pre_proc.py
mineru/utils/span_pre_proc.py
+17
-4
No files found.
mineru/utils/span_pre_proc.py
View file @
1ee15504
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
import
collections
import
re
import
re
import
statistics
import
statistics
...
@@ -187,7 +188,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
...
@@ -187,7 +188,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
span
[
'chars'
]
=
[]
span
[
'chars'
]
=
[]
new_spans
.
append
(
span
)
new_spans
.
append
(
span
)
need_ocr_spans
=
fill_char_in_spans
(
new_spans
,
page_all_chars
)
need_ocr_spans
=
fill_char_in_spans
(
new_spans
,
page_all_chars
,
median_span_height
)
"""对未填充的span进行ocr"""
"""对未填充的span进行ocr"""
if
len
(
need_ocr_spans
)
>
0
:
if
len
(
need_ocr_spans
)
>
0
:
...
@@ -208,14 +209,26 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
...
@@ -208,14 +209,26 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
return
spans
return
spans
def
fill_char_in_spans
(
spans
,
all_chars
):
def
fill_char_in_spans
(
spans
,
all_chars
,
median_span_height
):
# 简单从上到下排一下序
# 简单从上到下排一下序
spans
=
sorted
(
spans
,
key
=
lambda
x
:
x
[
'bbox'
][
1
])
spans
=
sorted
(
spans
,
key
=
lambda
x
:
x
[
'bbox'
][
1
])
grid_size
=
median_span_height
grid
=
collections
.
defaultdict
(
list
)
for
i
,
span
in
enumerate
(
spans
):
start_cell
=
int
(
span
[
'bbox'
][
1
]
/
grid_size
)
end_cell
=
int
(
span
[
'bbox'
][
3
]
/
grid_size
)
for
cell_idx
in
range
(
start_cell
,
end_cell
+
1
):
grid
[
cell_idx
].
append
(
i
)
for
char
in
all_chars
:
for
char
in
all_chars
:
char_center_y
=
(
char
[
'bbox'
][
1
]
+
char
[
'bbox'
][
3
])
/
2
cell_idx
=
int
(
char_center_y
/
grid_size
)
candidate_span_indices
=
grid
.
get
(
cell_idx
,
[])
for
span
in
spans
:
for
span_idx
in
candidate_span_indices
:
span
=
spans
[
span_idx
]
if
calculate_char_in_span
(
char
[
'bbox'
],
span
[
'bbox'
],
char
[
'char'
]):
if
calculate_char_in_span
(
char
[
'bbox'
],
span
[
'bbox'
],
char
[
'char'
]):
span
[
'chars'
].
append
(
char
)
span
[
'chars'
].
append
(
char
)
break
break
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment