Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
2cf7b1c6
Unverified
Commit
2cf7b1c6
authored
Nov 18, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 18, 2024
Browse files
Merge pull request #1013 from myhloli/dev
refactor(para): improve paragraph splitting logic
parents
66faa2d7
517fbe5b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
33 additions
and
15 deletions
+33
-15
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+33
-15
No files found.
magic_pdf/para/para_split_v3.py
View file @
2cf7b1c6
...
@@ -64,6 +64,7 @@ def __is_list_or_index_block(block):
...
@@ -64,6 +64,7 @@ def __is_list_or_index_block(block):
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
line_height
=
first_line
[
'bbox'
][
3
]
-
first_line
[
'bbox'
][
1
]
block_weight
=
block
[
'bbox_fs'
][
2
]
-
block
[
'bbox_fs'
][
0
]
block_weight
=
block
[
'bbox_fs'
][
2
]
-
block
[
'bbox_fs'
][
0
]
block_height
=
block
[
'bbox_fs'
][
3
]
-
block
[
'bbox_fs'
][
1
]
block_height
=
block
[
'bbox_fs'
][
3
]
-
block
[
'bbox_fs'
][
1
]
page_weight
,
page_height
=
block
[
'page_size'
]
left_close_num
=
0
left_close_num
=
0
left_not_close_num
=
0
left_not_close_num
=
0
...
@@ -75,6 +76,12 @@ def __is_list_or_index_block(block):
...
@@ -75,6 +76,12 @@ def __is_list_or_index_block(block):
multiple_para_flag
=
False
multiple_para_flag
=
False
last_line
=
block
[
'lines'
][
-
1
]
last_line
=
block
[
'lines'
][
-
1
]
if
page_weight
==
0
:
block_weight_radio
=
0
else
:
block_weight_radio
=
block_weight
/
page_weight
# logger.info(f"block_weight_radio: {block_weight_radio}")
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
if
(
first_line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
]
>
line_height
/
2
and
if
(
first_line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
]
>
line_height
/
2
and
abs
(
last_line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
])
<
line_height
/
2
and
abs
(
last_line
[
'bbox'
][
0
]
-
block
[
'bbox_fs'
][
0
])
<
line_height
/
2
and
...
@@ -114,7 +121,8 @@ def __is_list_or_index_block(block):
...
@@ -114,7 +121,8 @@ def __is_list_or_index_block(block):
right_close_num
+=
1
right_close_num
+=
1
else
:
else
:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
closed_area
=
0.26
*
block_weight
# 0.26
closed_area
=
0.35
*
block_weight
if
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
closed_area
:
if
block
[
'bbox_fs'
][
2
]
-
line
[
'bbox'
][
2
]
>
closed_area
:
right_not_close_num
+=
1
right_not_close_num
+=
1
...
@@ -161,8 +169,12 @@ def __is_list_or_index_block(block):
...
@@ -161,8 +169,12 @@ def __is_list_or_index_block(block):
line
[
ListLineTag
.
IS_LIST_START_LINE
]
=
True
line
[
ListLineTag
.
IS_LIST_START_LINE
]
=
True
return
BlockType
.
List
return
BlockType
.
List
elif
left_close_num
>=
2
and
(
elif
(
right_not_close_num
>=
2
or
line_end_flag
or
left_not_close_num
>=
2
)
and
not
multiple_para_flag
:
left_close_num
>=
2
and
(
right_not_close_num
>=
2
or
line_end_flag
or
left_not_close_num
>=
2
)
and
not
multiple_para_flag
# and block_weight_radio > 0.27
):
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
# 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾
if
left_close_num
/
len
(
block
[
'lines'
])
>
0.8
:
if
left_close_num
/
len
(
block
[
'lines'
])
>
0.8
:
# 这种是每个item只有一行,且左边都贴边的短item list
# 这种是每个item只有一行,且左边都贴边的短item list
...
@@ -223,18 +235,23 @@ def __merge_2_text_blocks(block1, block2):
...
@@ -223,18 +235,23 @@ def __merge_2_text_blocks(block1, block2):
if
len
(
last_line
[
'spans'
])
>
0
:
if
len
(
last_line
[
'spans'
])
>
0
:
last_span
=
last_line
[
'spans'
][
-
1
]
last_span
=
last_line
[
'spans'
][
-
1
]
line_height
=
last_line
[
'bbox'
][
3
]
-
last_line
[
'bbox'
][
1
]
line_height
=
last_line
[
'bbox'
][
3
]
-
last_line
[
'bbox'
][
1
]
if
(
abs
(
block2
[
'bbox_fs'
][
2
]
-
last_line
[
'bbox'
][
2
])
<
line_height
and
if
len
(
first_line
[
'spans'
])
>
0
:
not
last_span
[
'content'
].
endswith
(
LINE_STOP_FLAG
)
and
first_span
=
first_line
[
'spans'
][
0
]
# 两个block宽度差距超过2倍也不合并
if
len
(
first_span
[
'content'
])
>
0
:
abs
(
block1_weight
-
block2_weight
)
<
min_block_weight
span_start_with_num
=
first_span
[
'content'
][
0
].
isdigit
()
):
if
(
abs
(
block2
[
'bbox_fs'
][
2
]
-
last_line
[
'bbox'
][
2
])
<
line_height
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
and
not
last_span
[
'content'
].
endswith
(
LINE_STOP_FLAG
)
for
line
in
block1
[
'lines'
]:
# 两个block宽度差距超过2倍也不合并
for
span
in
line
[
'spans'
]:
and
abs
(
block1_weight
-
block2_weight
)
<
min_block_weight
span
[
CROSS_PAGE
]
=
True
and
not
span_start_with_num
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
):
block1
[
'lines'
]
=
[]
if
block1
[
'page_num'
]
!=
block2
[
'page_num'
]:
block1
[
LINES_DELETED
]
=
True
for
line
in
block1
[
'lines'
]:
for
span
in
line
[
'spans'
]:
span
[
CROSS_PAGE
]
=
True
block2
[
'lines'
].
extend
(
block1
[
'lines'
])
block1
[
'lines'
]
=
[]
block1
[
LINES_DELETED
]
=
True
return
block1
,
block2
return
block1
,
block2
...
@@ -302,6 +319,7 @@ def para_split(pdf_info_dict, debug_mode=False):
...
@@ -302,6 +319,7 @@ def para_split(pdf_info_dict, debug_mode=False):
blocks
=
copy
.
deepcopy
(
page
[
'preproc_blocks'
])
blocks
=
copy
.
deepcopy
(
page
[
'preproc_blocks'
])
for
block
in
blocks
:
for
block
in
blocks
:
block
[
'page_num'
]
=
page_num
block
[
'page_num'
]
=
page_num
block
[
'page_size'
]
=
page
[
'page_size'
]
all_blocks
.
extend
(
blocks
)
all_blocks
.
extend
(
blocks
)
__para_merge_page
(
all_blocks
)
__para_merge_page
(
all_blocks
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment