Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
98d23e71
Commit
98d23e71
authored
Jul 04, 2025
by
myhloli
Browse files
refactor: rename overlap detection functions for consistency in ocr_utils.py and span_block_fix.py
parent
e413f005
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
9 deletions
+9
-9
mineru/utils/ocr_utils.py
mineru/utils/ocr_utils.py
+6
-6
mineru/utils/span_block_fix.py
mineru/utils/span_block_fix.py
+3
-3
No files found.
mineru/utils/ocr_utils.py
View file @
98d23e71
...
@@ -22,7 +22,7 @@ def merge_spans_to_line(spans, threshold=0.6):
...
@@ -22,7 +22,7 @@ def merge_spans_to_line(spans, threshold=0.6):
current_line
=
[
spans
[
0
]]
current_line
=
[
spans
[
0
]]
for
span
in
spans
[
1
:]:
for
span
in
spans
[
1
:]:
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
_
_is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
if
_is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
current_line
.
append
(
span
)
current_line
.
append
(
span
)
else
:
else
:
# 否则,开始新行
# 否则,开始新行
...
@@ -35,9 +35,9 @@ def merge_spans_to_line(spans, threshold=0.6):
...
@@ -35,9 +35,9 @@ def merge_spans_to_line(spans, threshold=0.6):
return
lines
return
lines
def
_
_is_overlaps_y_exceeds_threshold
(
bbox1
,
def
_is_overlaps_y_exceeds_threshold
(
bbox1
,
bbox2
,
bbox2
,
overlap_ratio_threshold
=
0.8
):
overlap_ratio_threshold
=
0.8
):
"""检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过指定阈值"""
"""检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过指定阈值"""
_
,
y0_1
,
_
,
y1_1
=
bbox1
_
,
y0_1
,
_
,
y1_1
=
bbox1
_
,
y0_2
,
_
,
y1_2
=
bbox2
_
,
y0_2
,
_
,
y1_2
=
bbox2
...
@@ -50,7 +50,7 @@ def __is_overlaps_y_exceeds_threshold(bbox1,
...
@@ -50,7 +50,7 @@ def __is_overlaps_y_exceeds_threshold(bbox1,
return
(
overlap
/
min_height
)
>
overlap_ratio_threshold
if
min_height
>
0
else
False
return
(
overlap
/
min_height
)
>
overlap_ratio_threshold
if
min_height
>
0
else
False
def
_
_is_overlaps_x_exceeds_threshold
(
bbox1
,
def
_is_overlaps_x_exceeds_threshold
(
bbox1
,
bbox2
,
bbox2
,
overlap_ratio_threshold
=
0.8
):
overlap_ratio_threshold
=
0.8
):
"""检查两个bbox在x轴上是否有重叠,并且该重叠区域的宽度占两个bbox宽度更低的那个超过指定阈值"""
"""检查两个bbox在x轴上是否有重叠,并且该重叠区域的宽度占两个bbox宽度更低的那个超过指定阈值"""
...
@@ -194,7 +194,7 @@ def update_det_boxes(dt_boxes, mfd_res):
...
@@ -194,7 +194,7 @@ def update_det_boxes(dt_boxes, mfd_res):
masks_list
=
[]
masks_list
=
[]
for
mf_box
in
mfd_res
:
for
mf_box
in
mfd_res
:
mf_bbox
=
mf_box
[
'bbox'
]
mf_bbox
=
mf_box
[
'bbox'
]
if
_
_is_overlaps_y_exceeds_threshold
(
text_bbox
,
mf_bbox
):
if
_is_overlaps_y_exceeds_threshold
(
text_bbox
,
mf_bbox
):
masks_list
.
append
([
mf_bbox
[
0
],
mf_bbox
[
2
]])
masks_list
.
append
([
mf_bbox
[
0
],
mf_bbox
[
2
]])
text_x_range
=
[
text_bbox
[
0
],
text_bbox
[
2
]]
text_x_range
=
[
text_bbox
[
0
],
text_bbox
[
2
]]
text_remove_mask_range
=
remove_intervals
(
text_x_range
,
masks_list
)
text_remove_mask_range
=
remove_intervals
(
text_x_range
,
masks_list
)
...
...
mineru/utils/span_block_fix.py
View file @
98d23e71
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.ocr_utils
import
_
_is_overlaps_y_exceeds_threshold
,
_
_is_overlaps_x_exceeds_threshold
from
mineru.utils.ocr_utils
import
_is_overlaps_y_exceeds_threshold
,
_is_overlaps_x_exceeds_threshold
VERTICAL_SPAN_HEIGHT_TO_WIDTH_RATIO_THRESHOLD
=
2
VERTICAL_SPAN_HEIGHT_TO_WIDTH_RATIO_THRESHOLD
=
2
VERTICAL_SPAN_IN_BLOCK_THRESHOLD
=
0.8
VERTICAL_SPAN_IN_BLOCK_THRESHOLD
=
0.8
...
@@ -123,7 +123,7 @@ def merge_spans_to_line(spans, threshold=0.6):
...
@@ -123,7 +123,7 @@ def merge_spans_to_line(spans, threshold=0.6):
continue
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
_
_is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
if
_is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
current_line
.
append
(
span
)
current_line
.
append
(
span
)
else
:
else
:
# 否则,开始新行
# 否则,开始新行
...
@@ -162,7 +162,7 @@ def merge_spans_to_vertical_line(spans, threshold=0.6):
...
@@ -162,7 +162,7 @@ def merge_spans_to_vertical_line(spans, threshold=0.6):
continue
continue
# 如果当前的span与当前行的最后一个span在x轴上重叠,则添加到当前行
# 如果当前的span与当前行的最后一个span在x轴上重叠,则添加到当前行
if
_
_is_overlaps_x_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
if
_is_overlaps_x_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
threshold
):
current_line
.
append
(
span
)
current_line
.
append
(
span
)
else
:
else
:
vertical_lines
.
append
(
current_line
)
vertical_lines
.
append
(
current_line
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment