Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
68851ae0
Commit
68851ae0
authored
Oct 25, 2024
by
zhougaofeng
Browse files
Update para_split.py
parent
f3b09a0b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
18 deletions
+18
-18
magic_pdf/para/para_split.py
magic_pdf/para/para_split.py
+18
-18
No files found.
magic_pdf/para/para_split.py
View file @
68851ae0
...
@@ -98,8 +98,8 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
...
@@ -98,8 +98,8 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice
,
list_start_idx
=
find_repeating_patterns
(
line_fea_encode
)
list_indice
,
list_start_idx
=
find_repeating_patterns
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
#
if len(list_indice)>0:
logger
.
info
(
f
"发现了列表,列表行数:
{
list_indice
}
,
{
list_start_idx
}
"
)
#
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
segments
=
[]
...
@@ -107,10 +107,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
...
@@ -107,10 +107,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for
i
in
range
(
start
,
end
+
1
):
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
line_fea_encode
[
i
]
==
4
:
logger
.
info
(
f
"列表行的第
{
i
}
行不是顶格的"
)
#
logger.info(f"列表行的第{i}行不是顶格的")
break
break
else
:
#
else:
logger
.
info
(
f
"列表行的第
{
start
}
到第
{
end
}
行是列表"
)
#
logger.info(f"列表行的第{start}到第{end}行是列表")
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
...
@@ -350,7 +350,7 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
...
@@ -350,7 +350,7 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
next_first_para
=
next_paras
[
0
]
next_first_para
=
next_paras
[
0
]
if
pre_layout_list_info
[
1
]
and
not
next_layout_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if
pre_layout_list_info
[
1
]
and
not
next_layout_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger
.
info
(
f
"连接page
{
page_num
}
内的list"
)
#
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
may_list_lines
=
[]
for
j
in
range
(
len
(
next_paras
)):
for
j
in
range
(
len
(
next_paras
)):
...
@@ -379,7 +379,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
...
@@ -379,7 +379,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
return
False
return
False
if
pre_page_list_info
[
1
]
and
not
next_page_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if
pre_page_list_info
[
1
]
and
not
next_page_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger
.
info
(
f
"连接page
{
page_num
}
内的list"
)
#
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
may_list_lines
=
[]
for
j
in
range
(
len
(
next_page_paras
[
0
])):
for
j
in
range
(
len
(
next_page_paras
[
0
])):
...
@@ -431,7 +431,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
...
@@ -431,7 +431,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
][
-
1
]
pre_last_line
=
layout_paras
[
i
-
1
][
-
1
][
-
1
]
next_first_line
=
layout_paras
[
i
][
0
][
0
]
next_first_line
=
layout_paras
[
i
][
0
][
0
]
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
error
(
f
"page layout
{
i
}
has no line"
)
#
logger.error(f"page layout {i} has no line")
continue
continue
pre_last_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
pre_last_line
[
'spans'
]])
pre_last_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
pre_last_line
[
'spans'
]])
pre_last_line_type
=
pre_last_line
[
'spans'
][
-
1
][
'type'
]
pre_last_line_type
=
pre_last_line
[
'spans'
][
-
1
][
'type'
]
...
@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
...
@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
pass
if
debug_mode
:
if
debug_mode
:
logger
.
debug
(
line_hi
.
std
())
#
logger.debug(line_hi.std())
if
line_hi
.
std
()
<
2
:
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
"""行高度相同,那么判断是否居中"""
...
@@ -559,8 +559,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
...
@@ -559,8 +559,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
merge_para
=
[
l
[
0
]
for
l
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
if
debug_mode
:
#
if debug_mode:
logger
.
debug
(
para_text
)
#
logger.debug(para_text)
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
layout_para
[
start
:
end
+
1
]
=
[
merge_para
]
index_offset
-=
end
-
start
index_offset
-=
end
-
start
...
@@ -624,14 +624,14 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
...
@@ -624,14 +624,14 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
next_page_layout_bbox
=
new_layout_of_pages
[
page_num
]
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
is_conn
=
__connect_para_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
page_num
,
lang
)
if
debug_mode
:
#
if debug_mode:
if
is_conn
:
#
if is_conn:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的段落"
)
#
logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
#
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
is_list_conn
=
__connect_list_inter_page
(
pre_page_paras
,
next_page_paras
,
pre_page_layout_bbox
,
next_page_layout_bbox
,
all_page_list_info
[
page_num
-
1
],
all_page_list_info
[
page_num
],
page_num
,
lang
)
if
debug_mode
:
#
if debug_mode:
if
is_list_conn
:
#
if is_list_conn:
logger
.
info
(
f
"连接了第
{
page_num
-
1
}
页和第
{
page_num
}
页的列表段落"
)
#
logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment