Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
2aea5d6f
Unverified
Commit
2aea5d6f
authored
Jan 15, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jan 15, 2025
Browse files
Merge pull request #1551 from myhloli/dev
refactor(magic_pdf): improve title block merging logic
parents
852ae370
8570e006
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
33 additions
and
21 deletions
+33
-21
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+33
-21
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
2aea5d6f
...
@@ -674,38 +674,48 @@ def parse_page_core(
...
@@ -674,38 +674,48 @@ def parse_page_core(
page_w
,
page_h
=
magic_model
.
get_page_size
(
page_id
)
page_w
,
page_h
=
magic_model
.
get_page_size
(
page_id
)
def
merge_title_blocks
(
blocks
,
x_distance_threshold
=
0.1
*
page_w
):
def
merge_title_blocks
(
blocks
,
x_distance_threshold
=
0.1
*
page_w
):
def
merge_two_blocks
(
b1
,
b2
):
def
merge_two_bbox
(
b1
,
b2
):
# 合并两个标题块的边界框
x_min
=
min
(
b1
[
'bbox'
][
0
],
b2
[
'bbox'
][
0
])
x_min
=
min
(
b1
[
'bbox'
][
0
],
b2
[
'bbox'
][
0
])
y_min
=
min
(
b1
[
'bbox'
][
1
],
b2
[
'bbox'
][
1
])
y_min
=
min
(
b1
[
'bbox'
][
1
],
b2
[
'bbox'
][
1
])
x_max
=
max
(
b1
[
'bbox'
][
2
],
b2
[
'bbox'
][
2
])
x_max
=
max
(
b1
[
'bbox'
][
2
],
b2
[
'bbox'
][
2
])
y_max
=
max
(
b1
[
'bbox'
][
3
],
b2
[
'bbox'
][
3
])
y_max
=
max
(
b1
[
'bbox'
][
3
],
b2
[
'bbox'
][
3
])
merged_bbox
=
(
x_min
,
y_min
,
x_max
,
y_max
)
return
x_min
,
y_min
,
x_max
,
y_max
def
merge_two_blocks
(
b1
,
b2
):
# 合并两个标题块的边界框
b1
[
'bbox'
]
=
merge_two_bbox
(
b1
,
b2
)
# 合并两个标题块的文本内容
# 合并两个标题块的文本内容
merged_score
=
(
b1
[
'score'
]
+
b2
[
'score'
])
/
2
line1
=
b1
[
'lines'
][
0
]
line2
=
b2
[
'lines'
][
0
]
line1
[
'bbox'
]
=
merge_two_bbox
(
line1
,
line2
)
line1
[
'spans'
].
extend
(
line2
[
'spans'
])
return
{
'bbox'
:
merged_bbox
,
'score'
:
merged_score
}
return
b1
,
b2
# 按 y 轴重叠度聚集标题块
# 按 y 轴重叠度聚集标题块
y_overlapping_blocks
=
[]
y_overlapping_blocks
=
[]
while
blocks
:
title_bs
=
[
b
for
b
in
blocks
if
b
[
'type'
]
==
BlockType
.
Title
]
block1
=
blocks
.
pop
(
0
)
while
title_bs
:
block1
=
title_bs
.
pop
(
0
)
current_row
=
[
block1
]
current_row
=
[
block1
]
to_remove
=
[]
to_remove
=
[]
for
block2
in
blocks
:
for
block2
in
title_bs
:
if
__is_overlaps_y_exceeds_threshold
(
block1
[
'bbox'
],
block2
[
'bbox'
],
0.9
):
if
(
__is_overlaps_y_exceeds_threshold
(
block1
[
'bbox'
],
block2
[
'bbox'
],
0.9
)
and
len
(
block1
[
'lines'
])
==
1
and
len
(
block2
[
'lines'
])
==
1
):
current_row
.
append
(
block2
)
current_row
.
append
(
block2
)
to_remove
.
append
(
block2
)
to_remove
.
append
(
block2
)
for
b
in
to_remove
:
for
b
in
to_remove
:
blocks
.
remove
(
b
)
title_bs
.
remove
(
b
)
y_overlapping_blocks
.
append
(
current_row
)
y_overlapping_blocks
.
append
(
current_row
)
# 按x轴坐标排序并合并标题块
# 按x轴坐标排序并合并标题块
merged_blocks
=
[]
to_remove_blocks
=
[]
for
row
in
y_overlapping_blocks
:
for
row
in
y_overlapping_blocks
:
if
len
(
row
)
==
1
:
if
len
(
row
)
==
1
:
merged_blocks
.
append
(
row
[
0
])
continue
continue
# 按x轴坐标排序
# 按x轴坐标排序
...
@@ -719,18 +729,17 @@ def parse_page_core(
...
@@ -719,18 +729,17 @@ def parse_page_core(
left_height
=
left_block
[
'bbox'
][
3
]
-
left_block
[
'bbox'
][
1
]
left_height
=
left_block
[
'bbox'
][
3
]
-
left_block
[
'bbox'
][
1
]
right_height
=
right_block
[
'bbox'
][
3
]
-
right_block
[
'bbox'
][
1
]
right_height
=
right_block
[
'bbox'
][
3
]
-
right_block
[
'bbox'
][
1
]
if
right_block
[
'bbox'
][
0
]
-
left_block
[
'bbox'
][
2
]
<
x_distance_threshold
and
left_height
*
0.95
<
right_height
<
left_height
*
1.05
:
if
(
merged_block
=
merge_two_blocks
(
merged_block
,
right_block
)
right_block
[
'bbox'
][
0
]
-
left_block
[
'bbox'
][
2
]
<
x_distance_threshold
and
left_height
*
0.95
<
right_height
<
left_height
*
1.05
):
merged_block
,
to_remove_block
=
merge_two_blocks
(
merged_block
,
right_block
)
to_remove_blocks
.
append
(
to_remove_block
)
else
:
else
:
merged_blocks
.
append
(
merged_block
)
merged_block
=
right_block
merged_block
=
right_block
merged_blocks
.
append
(
merged_block
)
for
b
in
to_remove_blocks
:
blocks
.
remove
(
b
)
return
merged_blocks
"""同一行被断开的title合并"""
title_blocks
=
merge_title_blocks
(
title_blocks
)
"""将所有区块的bbox整理到一起"""
"""将所有区块的bbox整理到一起"""
# interline_equation_blocks参数不够准,后面切换到interline_equations上
# interline_equation_blocks参数不够准,后面切换到interline_equations上
...
@@ -816,6 +825,9 @@ def parse_page_core(
...
@@ -816,6 +825,9 @@ def parse_page_core(
"""对block进行fix操作"""
"""对block进行fix操作"""
fix_blocks
=
fix_block_spans_v2
(
block_with_spans
)
fix_blocks
=
fix_block_spans_v2
(
block_with_spans
)
"""同一行被断开的title合并"""
merge_title_blocks
(
fix_blocks
)
"""获取所有line并计算正文line的高度"""
"""获取所有line并计算正文line的高度"""
line_height
=
get_line_height
(
fix_blocks
)
line_height
=
get_line_height
(
fix_blocks
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment