Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
ae7b0a6e
Commit
ae7b0a6e
authored
May 30, 2025
by
myhloli
Browse files
refactor: implement block preprocessing utilities for improved bounding box management
parent
8f1f9abe
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
341 additions
and
3 deletions
+341
-3
mineru/backend/pipeline/model_json_to_middle_json.py
mineru/backend/pipeline/model_json_to_middle_json.py
+61
-2
mineru/utils/block_pre_proc.py
mineru/utils/block_pre_proc.py
+224
-0
mineru/utils/boxbase.py
mineru/utils/boxbase.py
+56
-1
No files found.
mineru/backend/pipeline/model_json_to_middle_json.py
View file @
ae7b0a6e
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
from
mineru.utils.block_pre_proc
import
prepare_block_bboxes
from
mineru.utils.pipeline_magic_model
import
MagicModel
from
mineru.utils.pipeline_magic_model
import
MagicModel
from
mineru.version
import
__version__
from
mineru.version
import
__version__
from
mineru.utils.hash_utils
import
str_md5
from
mineru.utils.hash_utils
import
str_md5
...
@@ -8,9 +9,51 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
...
@@ -8,9 +9,51 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
scale
=
image_dict
[
"scale"
]
scale
=
image_dict
[
"scale"
]
page_pil_img
=
image_dict
[
"img_pil"
]
page_pil_img
=
image_dict
[
"img_pil"
]
page_img_md5
=
str_md5
(
image_dict
[
"img_base64"
])
page_img_md5
=
str_md5
(
image_dict
[
"img_base64"
])
width
,
height
=
map
(
int
,
page
.
get_size
())
page_w
,
page_h
=
map
(
int
,
page
.
get_size
())
magic_model
=
MagicModel
(
page_model_info
,
scale
)
magic_model
=
MagicModel
(
page_model_info
,
scale
)
"""从magic_model对象中获取后面会用到的区块信息"""
img_groups
=
magic_model
.
get_imgs
()
table_groups
=
magic_model
.
get_tables
()
"""对image和table的区块分组"""
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
=
process_groups
(
img_groups
,
'image_body'
,
'image_caption_list'
,
'image_footnote_list'
)
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
=
process_groups
(
table_groups
,
'table_body'
,
'table_caption_list'
,
'table_footnote_list'
)
discarded_blocks
=
magic_model
.
get_discarded
()
text_blocks
=
magic_model
.
get_text_blocks
()
title_blocks
=
magic_model
.
get_title_blocks
()
inline_equations
,
interline_equations
,
interline_equation_blocks
=
magic_model
.
get_equations
()
"""将所有区块的bbox整理到一起"""
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
prepare_block_bboxes
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
,
)
else
:
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
prepare_block_bboxes
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
,
)
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
def
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
lang
=
None
,
ocr
=
False
):
...
@@ -22,4 +65,20 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
...
@@ -22,4 +65,20 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
lang
=
lang
,
ocr
=
ocr
page_model_info
,
image_dict
,
page
,
image_writer
,
page_index
,
lang
=
lang
,
ocr
=
ocr
)
)
middle_json
[
"pdf_info"
].
append
(
page_info
)
middle_json
[
"pdf_info"
].
append
(
page_info
)
return
middle_json
return
middle_json
\ No newline at end of file
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten grouped blocks into three lists, tagging each with its group index.

    Every group is a mapping holding one body block under ``body_key`` and
    iterables of blocks under ``caption_key`` and ``footnote_key``. Each block
    is mutated in place: its ``'group_id'`` entry is set to the position of the
    owning group inside ``groups``.

    Returns:
        tuple: ``(body_blocks, caption_blocks, footnote_blocks)`` as lists, in
        group order.
    """
    bodies, captions, footnotes = [], [], []
    for group_id, group in enumerate(groups):
        body = group[body_key]
        body['group_id'] = group_id
        bodies.append(body)
        # Captions are handled before footnotes, exactly as the group lists them.
        for member_key, bucket in ((caption_key, captions), (footnote_key, footnotes)):
            for member in group[member_key]:
                member['group_id'] = group_id
                bucket.append(member)
    return bodies, captions, footnotes
\ No newline at end of file
mineru/utils/block_pre_proc.py
0 → 100644
View file @
ae7b0a6e
# Copyright (c) Opendatalab. All rights reserved.
from
mineru.utils.boxbase
import
(
calculate_iou
,
calculate_overlap_area_in_bbox1_area_ratio
,
calculate_vertical_projection_overlap_ratio
,
get_minbox_if_overlap_by_ratio
)
from
mineru.utils.enum_class
import
BlockType
def prepare_block_bboxes(
    img_body_blocks,
    img_caption_blocks,
    img_footnote_blocks,
    table_body_blocks,
    table_caption_blocks,
    table_footnote_blocks,
    discarded_blocks,
    text_blocks,
    title_blocks,
    interline_equation_blocks,
    page_w,
    page_h,
):
    """Collect every block of a page into flat bbox records and resolve conflicts.

    Args:
        img_body_blocks, img_caption_blocks, img_footnote_blocks,
        table_body_blocks, table_caption_blocks, table_footnote_blocks:
            Blocks with ``'bbox'``, ``'score'`` and ``'group_id'`` entries.
        discarded_blocks, text_blocks, title_blocks, interline_equation_blocks:
            Blocks with ``'bbox'`` and ``'score'`` entries.
        page_w: Page width, used for the footnote width threshold.
        page_h: Page height, used for the footnote position threshold.

    Returns:
        tuple: ``(all_bboxes, all_discarded_blocks, footnote_blocks)`` where
        ``all_bboxes`` holds the surviving records sorted by ``x0 + y0``,
        ``all_discarded_blocks`` holds the discarded records plus any record
        moved out of ``all_bboxes`` for sitting under a footnote, and
        ``footnote_blocks`` is a list of ``[x0, y0, x1, y1]`` boxes.
    """
    all_bboxes = []

    add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes)
    add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes)
    # Fixed: image footnotes used to be registered as IMAGE_CAPTION, which
    # made them indistinguishable from captions further down the pipeline.
    add_bboxes(img_footnote_blocks, BlockType.IMAGE_FOOTNOTE, all_bboxes)
    add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes)
    add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes)
    add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes)
    add_bboxes(text_blocks, BlockType.TEXT, all_bboxes)
    add_bboxes(title_blocks, BlockType.TITLE, all_bboxes)
    add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes)

    # --- Resolve nested / conflicting blocks ---
    # A text box overlapping a title box: trust the text box.
    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
    # Any box overlapping a discarded box: trust the discarded box.
    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)

    # Interline equations conflicting with text boxes come in two flavours.
    # 1) IoU close to 1: trust the interline-equation box.
    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
    # 2) Equation contained in a much larger text box: trust the text box.
    #    The equation box is dropped later by the "large box contains small
    #    box" pass below.

    # Records for the discarded blocks themselves.
    all_discarded_blocks = []
    add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks)

    # Footnote detection: a discarded block wider than 1/3 of the page, taller
    # than 10, and starting in the bottom 30% of the page.
    footnote_blocks = []
    for discarded in discarded_blocks:
        x0, y0, x1, y1 = discarded['bbox']
        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
            footnote_blocks.append([x0, y0, x1, y1])

    # Move every record located under a footnote into the discarded list.
    for block in find_blocks_under_footnote(all_bboxes, footnote_blocks):
        all_bboxes.remove(block)
        all_discarded_blocks.append(block)

    # Large boxes may still contain small ones after the passes above; drop
    # the small ones.
    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)

    # NOTE: a further pass that separated the remaining overlapping bboxes
    # (remove_overlap_between_bbox_for_block) is currently disabled.
    all_bboxes.sort(key=lambda x: x[0] + x[1])
    return all_bboxes, all_discarded_blocks, footnote_blocks
def add_bboxes(blocks, block_type, bboxes):
    """Append one flat record per block to ``bboxes`` (mutated in place).

    Record layout: indices 0-3 hold the block's bbox, index 7 the block type
    and index 12 the block's ``'score'``; indices 4-6 and 8-11 are ``None``
    placeholders. Image/table body, caption and footnote records carry the
    block's ``'group_id'`` as an extra trailing element (index 13).
    """
    for item in blocks:
        left, top, right, bottom = item['bbox']
        carries_group = block_type in [
            BlockType.IMAGE_BODY,
            BlockType.IMAGE_CAPTION,
            BlockType.IMAGE_FOOTNOTE,
            BlockType.TABLE_BODY,
            BlockType.TABLE_CAPTION,
            BlockType.TABLE_FOOTNOTE,
        ]
        record = [
            left, top, right, bottom,
            None, None, None,
            block_type,
            None, None, None, None,
            item['score'],
        ]
        if carries_group:
            record.append(item['group_id'])
        bboxes.append(record)
def fix_text_overlap_title_blocks(all_bboxes):
    """Drop title records that almost coincide with a text record.

    A title record is removed from ``all_bboxes`` (in place) when its bbox has
    an IoU above 0.8 with the bbox of any text record; the text record is kept.

    Returns:
        list: the same ``all_bboxes`` list object, after removal.
    """
    # Index 7 of a record stores its block type.
    texts = [entry for entry in all_bboxes if entry[7] == BlockType.TEXT]
    titles = [entry for entry in all_bboxes if entry[7] == BlockType.TITLE]

    doomed = []
    for text in texts:
        for title in titles:
            if calculate_iou(text[:4], title[:4]) > 0.8 and title not in doomed:
                doomed.append(title)

    for title in doomed:
        all_bboxes.remove(title)
    return all_bboxes
def remove_need_drop_blocks(all_bboxes, discarded_blocks):
    """Drop records that are mostly covered by a discarded block.

    A record is removed from ``all_bboxes`` (in place) as soon as one entry of
    ``discarded_blocks`` covers more than 60% of the record's own bbox area.

    Returns:
        list: the same ``all_bboxes`` list object, after removal.
    """
    doomed = []
    for candidate in all_bboxes:
        # any() stops at the first discarded block that covers the candidate.
        covered = any(
            calculate_overlap_area_in_bbox1_area_ratio(candidate[:4], dropped['bbox']) > 0.6
            for dropped in discarded_blocks
        )
        if covered and candidate not in doomed:
            doomed.append(candidate)

    for candidate in doomed:
        all_bboxes.remove(candidate)
    return all_bboxes
def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
    """Drop text records that almost coincide with an interline-equation record.

    A text record is removed from ``all_bboxes`` (in place) when its bbox has
    an IoU above 0.8 with the bbox of any interline-equation record; the
    equation record is kept.

    Returns:
        list: the same ``all_bboxes`` list object, after removal.
    """
    # Index 7 of a record stores its block type.
    texts = [entry for entry in all_bboxes if entry[7] == BlockType.TEXT]
    equations = [
        entry for entry in all_bboxes if entry[7] == BlockType.INTERLINE_EQUATION
    ]

    doomed = []
    for equation in equations:
        for text in texts:
            if calculate_iou(equation[:4], text[:4]) > 0.8 and text not in doomed:
                doomed.append(text)

    for text in doomed:
        all_bboxes.remove(text)
    return all_bboxes
def find_blocks_under_footnote(all_bboxes, footnote_blocks):
    """Return the records of ``all_bboxes`` that lie under a footnote box.

    A record qualifies when its ``y0`` is greater than or equal to a
    footnote's ``y1`` and the horizontal overlap with that footnote covers at
    least 80% of the record's own width. ``all_bboxes`` is not modified.

    Returns:
        list: the qualifying records, in their ``all_bboxes`` order.
    """
    flagged = []
    for entry in all_bboxes:
        left, top, right, bottom = entry[:4]
        for footnote in footnote_blocks:
            _, _, _, footnote_bottom = footnote
            starts_below = top >= footnote_bottom
            if starts_below and calculate_vertical_projection_overlap_ratio(
                (left, top, right, bottom), footnote
            ) >= 0.8:
                if entry not in flagged:
                    flagged.append(entry)
                # One matching footnote is enough for this record.
                break
    return flagged
def remove_overlaps_min_blocks(all_bboxes):
    """Merge heavily overlapping records into the larger one and drop the smaller.

    Every ordered pair of unequal records is passed to
    ``get_minbox_if_overlap_by_ratio`` with a 0.8 ratio. When that returns a
    box, the first record in ``all_bboxes`` whose bbox equals it is treated as
    the smaller one: the other record of the pair has its bbox grown in place
    to the union of both, and the smaller record is removed once all pairs
    have been examined.

    Returns:
        list: the same ``all_bboxes`` list object, after removal.
    """
    dropped = []
    for first in all_bboxes:
        for second in all_bboxes:
            if first == second:
                continue
            smaller_bbox = get_minbox_if_overlap_by_ratio(first[:4], second[:4], 0.8)
            if smaller_bbox is None:
                continue

            # Locate the record that owns the smaller bbox.
            victim = None
            for candidate in all_bboxes:
                if candidate[:4] == smaller_bbox:
                    victim = candidate
                    break
            if victim is None or victim in dropped:
                continue

            # The smaller box is not simply deleted: the surviving record is
            # first expanded so that it also covers the victim's area.
            keeper = second if first == victim else first
            keep_x0, keep_y0, keep_x1, keep_y1 = keeper[:4]
            drop_x0, drop_y0, drop_x1, drop_y1 = victim[:4]
            keeper[:4] = [
                min(keep_x0, drop_x0),
                min(keep_y0, drop_y0),
                max(keep_x1, drop_x1),
                max(keep_y1, drop_y1),
            ]
            dropped.append(victim)

    for victim in dropped:
        all_bboxes.remove(victim)
    return all_bboxes
\ No newline at end of file
mineru/utils/boxbase.py
View file @
ae7b0a6e
...
@@ -156,4 +156,59 @@ def _is_in(box1, box2) -> bool:
...
@@ -156,4 +156,59 @@ def _is_in(box1, box2) -> bool:
return
(
x0_1
>=
x0_2
and
# box1的左边界不在box2的左边外
return
(
x0_1
>=
x0_2
and
# box1的左边界不在box2的左边外
y0_1
>=
y0_2
and
# box1的上边界不在box2的上边外
y0_1
>=
y0_2
and
# box1的上边界不在box2的上边外
x1_1
<=
x1_2
and
# box1的右边界不在box2的右边外
x1_1
<=
x1_2
and
# box1的右边界不在box2的右边外
y1_1
<=
y1_2
)
# box1的下边界不在box2的下边外
y1_1
<=
y1_2
)
# box1的下边界不在box2的下边外
\ No newline at end of file
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the share of ``bbox1``'s area that is overlapped by ``bbox2``.

    Args:
        bbox1: Box as ``(x0, y0, x1, y1)``; its area is the denominator.
        bbox2: Box as ``(x0, y0, x1, y1)``.

    Returns:
        float: intersection area divided by the area of ``bbox1``; ``0.0``
        when the boxes do not intersect or ``bbox1`` has zero area.
    """
    # Corners of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        # Always hand back a float so every exit path has the same type
        # (the degenerate case used to return the int 0).
        return 0.0
    return intersection_area / bbox1_area
def calculate_vertical_projection_overlap_ratio(block1, block2):
    """Return how much of ``block1``'s x-extent is shared with ``block2``.

    Both blocks are projected onto the x-axis; the length of the overlap of
    the two projections is divided by the width of ``block1``.

    Args:
        block1 (tuple): First block as ``(x0, y0, x1, y1)``.
        block2 (tuple): Second block as ``(x0, y0, x1, y1)``.

    Returns:
        float: overlap length over ``block1`` width; ``0.0`` when the
        projections do not overlap or ``block1`` has zero width.
    """
    left_a, _, right_a, _ = block1
    left_b, _, right_b, _ = block2

    overlap_start = max(left_a, left_b)
    overlap_end = min(right_a, right_b)
    width_a = right_a - left_a

    if overlap_end < overlap_start or width_a == 0:
        return 0.0
    return (overlap_end - overlap_start) / width_a
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment