Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
18691cfd
Commit
18691cfd
authored
Jul 04, 2025
by
myhloli
Browse files
refactor: enhance span merging logic for vertical text blocks in span_block_fix.py and ocr_utils.py
parent
a2f0099c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
96 additions
and
4 deletions
+96
-4
mineru/utils/ocr_utils.py
mineru/utils/ocr_utils.py
+15
-1
mineru/utils/span_block_fix.py
mineru/utils/span_block_fix.py
+81
-3
No files found.
mineru/utils/ocr_utils.py
View file @
18691cfd
...
@@ -45,7 +45,21 @@ def __is_overlaps_y_exceeds_threshold(bbox1,
...
@@ -45,7 +45,21 @@ def __is_overlaps_y_exceeds_threshold(bbox1,
# max_height = max(height1, height2)
# max_height = max(height1, height2)
min_height
=
min
(
height1
,
height2
)
min_height
=
min
(
height1
,
height2
)
return
(
overlap
/
min_height
)
>
overlap_ratio_threshold
return
(
overlap
/
min_height
)
>
overlap_ratio_threshold
if
min_height
>
0
else
False
def __is_overlaps_x_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Tell whether two bboxes overlap enough along the x axis.

    The horizontal overlap length is divided by the width of the narrower
    of the two boxes, and that ratio is compared against the threshold.

    Args:
        bbox1: Four-element box unpacked as (x0, y0, x1, y1); only the x
            coordinates are used.
        bbox2: Second box, same layout as ``bbox1``.
        overlap_ratio_threshold: The ratio must be strictly greater than
            this value for the function to return True.

    Returns:
        True when overlap / narrower width is strictly above the threshold.
        False otherwise, including when the narrower width is not positive.
    """
    left_a, _, right_a, _ = bbox1
    left_b, _, right_b, _ = bbox2

    # Length of the shared x interval, clamped to 0 for disjoint boxes.
    shared = max(0, min(right_a, right_b) - max(left_a, left_b))

    narrower = min(right_a - left_a, right_b - left_b)
    if not narrower > 0:
        # A degenerate box has no width to normalise by.
        return False
    return (shared / narrower) > overlap_ratio_threshold
def
img_decode
(
content
:
bytes
):
def
img_decode
(
content
:
bytes
):
...
...
mineru/utils/span_block_fix.py
View file @
18691cfd
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.ocr_utils
import
__is_overlaps_y_exceeds_threshold
from
mineru.utils.ocr_utils
import
__is_overlaps_y_exceeds_threshold
,
__is_overlaps_x_exceeds_threshold
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
...
@@ -71,8 +71,26 @@ def fix_text_block(block):
...
@@ -71,8 +71,26 @@ def fix_text_block(block):
for
span
in
block
[
'spans'
]:
for
span
in
block
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
INTERLINE_EQUATION
:
if
span
[
'type'
]
==
ContentType
.
INTERLINE_EQUATION
:
span
[
'type'
]
=
ContentType
.
INLINE_EQUATION
span
[
'type'
]
=
ContentType
.
INLINE_EQUATION
block_lines
=
merge_spans_to_line
(
block
[
'spans'
])
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
# 假设block中的span超过80%的数量高度是宽度的两倍以上,则认为是纵向文本块
vertical_span_count
=
sum
(
1
for
span
in
block
[
'spans'
]
if
(
span
[
'bbox'
][
3
]
-
span
[
'bbox'
][
1
])
/
(
span
[
'bbox'
][
2
]
-
span
[
'bbox'
][
0
])
>
2
)
total_span_count
=
len
(
block
[
'spans'
])
if
total_span_count
==
0
:
vertical_ratio
=
0
else
:
vertical_ratio
=
vertical_span_count
/
total_span_count
if
vertical_ratio
>
0.8
:
# 如果是纵向文本块,则按纵向lines处理
block_lines
=
merge_spans_to_vertical_line
(
block
[
'spans'
])
sort_block_lines
=
vertical_line_sort_spans_from_top_to_bottom
(
block_lines
)
else
:
block_lines
=
merge_spans_to_line
(
block
[
'spans'
])
sort_block_lines
=
line_sort_spans_by_left_to_right
(
block_lines
)
block
[
'lines'
]
=
sort_block_lines
block
[
'lines'
]
=
sort_block_lines
del
block
[
'spans'
]
del
block
[
'spans'
]
return
block
return
block
...
@@ -117,6 +135,44 @@ def merge_spans_to_line(spans, threshold=0.6):
...
@@ -117,6 +135,44 @@ def merge_spans_to_line(spans, threshold=0.6):
return
lines
return
lines
def merge_spans_to_vertical_line(spans, threshold=0.6):
    """Group the spans of a vertical text block into columns, right to left.

    ``spans`` is sorted in place by the right edge (``bbox[2]``) in
    descending order, then walked once. A span joins the current column
    when its x-axis overlap with the column's last span exceeds
    ``threshold``; otherwise it opens a new column.

    Args:
        spans: List of span dicts carrying at least ``'bbox'`` and
            ``'type'``. The list is reordered in place.
        threshold: Overlap ratio passed to
            ``__is_overlaps_x_exceeds_threshold``.

    Returns:
        A list of columns, each a non-empty list of spans. An empty input
        yields an empty list.
    """
    if len(spans) == 0:
        return []

    # Span types that always occupy a column of their own.
    standalone_types = (
        ContentType.INTERLINE_EQUATION,
        ContentType.IMAGE,
        ContentType.TABLE,
    )

    def _is_standalone(item):
        return item['type'] in standalone_types

    # Rightmost spans first, i.e. right-to-left reading order.
    spans.sort(key=lambda item: item['bbox'][2], reverse=True)

    columns = []
    column = [spans[0]]
    for candidate in spans[1:]:
        # The candidate extends the current column only if neither it nor
        # anything already in the column is a standalone element, and it
        # overlaps the column's last span sufficiently on the x axis.
        joins_column = (
            not _is_standalone(candidate)
            and not any(_is_standalone(member) for member in column)
            and __is_overlaps_x_exceeds_threshold(
                candidate['bbox'], column[-1]['bbox'], threshold
            )
        )
        if joins_column:
            column.append(candidate)
        else:
            columns.append(column)
            column = [candidate]

    # The column still being built is never empty at this point.
    columns.append(column)
    return columns
# 将每一个line中的span从左到右排序
# 将每一个line中的span从左到右排序
def
line_sort_spans_by_left_to_right
(
lines
):
def
line_sort_spans_by_left_to_right
(
lines
):
line_objects
=
[]
line_objects
=
[]
...
@@ -136,6 +192,28 @@ def line_sort_spans_by_left_to_right(lines):
...
@@ -136,6 +192,28 @@ def line_sort_spans_by_left_to_right(lines):
return
line_objects
return
line_objects
def vertical_line_sort_spans_from_top_to_bottom(vertical_lines):
    """Order the spans of each vertical column top to bottom and box the column.

    Each column is sorted in place by the top edge (``bbox[1]``) of its
    spans, and the same list object is stored in the result. The column
    bbox is the union of its spans' bboxes.

    Args:
        vertical_lines: List of columns, each a list of span dicts with a
            ``'bbox'`` indexed as [x0, y0, x1, y1].

    Returns:
        A list of ``{'bbox': [x0, y0, x1, y1], 'spans': column}`` dicts in
        the same order as the non-empty input columns.
    """
    line_objects = []
    for line in vertical_lines:
        # An empty column has no extent; min()/max() would raise ValueError
        # on it, so it is skipped rather than aborting the whole block.
        if not line:
            continue

        # Top-to-bottom reading order inside the column (in place).
        line.sort(key=lambda span: span['bbox'][1])

        # Bounding box enclosing every span of the column.
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]

        line_objects.append({
            'bbox': line_bbox,
            'spans': line,
        })
    return line_objects
def
fix_block_spans
(
block_with_spans
):
def
fix_block_spans
(
block_with_spans
):
fix_blocks
=
[]
fix_blocks
=
[]
for
block
in
block_with_spans
:
for
block
in
block_with_spans
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment