Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d2cb75e8
You need to sign in or sign up before continuing.
Commit
d2cb75e8
authored
Mar 19, 2024
by
xuchao
Browse files
利用下一行开头具有的空格特征分割段落
parent
acabae56
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
19 deletions
+23
-19
magic_pdf/para/para_split.py
magic_pdf/para/para_split.py
+23
-19
No files found.
magic_pdf/para/para_split.py
View file @
d2cb75e8
...
@@ -142,47 +142,51 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
...
@@ -142,47 +142,51 @@ def __group_line_by_layout(blocks, layout_bboxes, lang="en"):
return
lines_group
return
lines_group
def
__split_para_in_layoutbox
(
lines_group
,
layout_bbox
es
,
lang
=
"en"
,
char_avg_len
=
10
):
def
__split_para_in_layoutbox
(
lines_group
,
new_
layout_bbox
,
lang
=
"en"
,
char_avg_len
=
10
):
"""
"""
lines_group 进行行分段——layout内部进行分段。
lines_group 进行行分段——layout内部进行分段。
lines_group内每个元素是一个Layoutbox内的所有行。
1. 先计算每个group的左右边界。
1. 先计算每个group的左右边界。
2. 然后根据行末尾特征进行分段。
2. 然后根据行末尾特征进行分段。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。
且下一行开头不留空白。
"""
"""
paras
=
[]
paras
=
[]
right_tail_distance
=
1.5
*
char_avg_len
right_tail_distance
=
1.5
*
char_avg_len
for
lines
in
lines_group
:
for
lines
in
lines_group
:
if
len
(
lines
)
==
0
:
total_lines
=
len
(
lines
)
if
total_lines
<=
1
:
# 0行无需处理。1行无法分段。
continue
continue
layout_right
=
max
([
line
[
'bbox'
][
2
]
for
line
in
lines
])
#layout_right = max([line['bbox'][2] for line in lines])
layout_right
=
__find_layout_bbox_by_line
(
lines
[
0
][
'bbox'
],
new_layout_bbox
)[
2
]
para
=
[]
# 元素是line
para
=
[]
# 元素是line
for
line
in
lines
:
line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
line
[
'spans'
]])
for
i
,
line
in
enumerate
(
lines
):
#logger.info(line_text)
# 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断一下行结尾特征。
last_span_type
=
line
[
'spans'
][
-
1
][
'type'
]
if
last_span_type
in
[
TEXT
,
INLINE_EQUATION
]:
cur_line_type
=
line
[
'spans'
][
-
1
][
'type'
]
last_char
=
line
[
'spans'
][
-
1
][
'content'
][
-
1
]
#cur_line_last_char = line['spans'][-1]['content'][-1]
if
last_char
in
LINE_STOP_FLAG
or
line
[
'bbox'
][
2
]
<
layout_right
-
right_tail_distance
:
next_line
=
lines
[
i
+
1
]
if
i
<
total_lines
-
1
else
None
if
cur_line_type
in
[
TEXT
,
INLINE_EQUATION
]:
if
line
[
'bbox'
][
2
]
<
layout_right
-
right_tail_distance
:
para
.
append
(
line
)
para
.
append
(
line
)
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([span['content'] for line in para for span in line['spans']])
# logger.info(para_text)
para
=
[]
para
=
[]
elif
line
[
'bbox'
][
2
]
>=
layout_right
-
right_tail_distance
and
next_line
and
next_line
[
'bbox'
][
0
]
==
layout_right
:
# 现在这行到了行尾沾满,下一行存在且顶格。
para
.
append
(
line
)
else
:
else
:
para
.
append
(
line
)
para
.
append
(
line
)
paras
.
append
(
para
)
para
=
[]
else
:
# 其他,图片、表格、行间公式,各自占一段
else
:
# 其他,图片、表格、行间公式,各自占一段
if
len
(
para
)
>
0
:
# 先把之前的段落加入到结果中
if
len
(
para
)
>
0
:
# 先把之前的段落加入到结果中
paras
.
append
(
para
)
paras
.
append
(
para
)
para
=
[]
para
=
[]
paras
.
append
([
line
])
# 再把当前行加入到结果中。当前行为行间公式、图、表等。
paras
.
append
([
line
])
# 再把当前行加入到结果中。当前行为行间公式、图、表等。
para
=
[]
para
=
[]
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
if
len
(
para
)
>
0
:
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
para
=
[]
para
=
[]
return
paras
return
paras
...
@@ -285,7 +289,7 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
...
@@ -285,7 +289,7 @@ def __do_split(blocks, layout_bboxes, new_layout_bbox, lang="en"):
4. 图、表,目前独占一行,不考虑分段。
4. 图、表,目前独占一行,不考虑分段。
"""
"""
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
lines_group
=
__group_line_by_layout
(
blocks
,
layout_bboxes
,
lang
)
# block内分段
layout_paras
=
__split_para_in_layoutbox
(
lines_group
,
layout_bbox
es
,
lang
)
# layout内分段
layout_paras
=
__split_para_in_layoutbox
(
lines_group
,
new_
layout_bbox
,
lang
)
# layout内分段
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
)
# layout间链接段落
connected_layout_paras
=
__connect_para_inter_layoutbox
(
layout_paras
,
new_layout_bbox
,
lang
)
# layout间链接段落
return
connected_layout_paras
return
connected_layout_paras
...
@@ -315,4 +319,4 @@ def para_split(pdf_info_dict, lang="en"):
...
@@ -315,4 +319,4 @@ def para_split(pdf_info_dict, lang="en"):
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
lang
)
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
lang
)
if
is_conn
:
if
is_conn
:
logger
.
info
(
f
"连接了第
{
i
-
1
}
页和第
{
i
}
页的段落"
)
logger
.
info
(
f
"连接了第
{
i
-
1
}
页和第
{
i
}
页的段落"
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment