Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
dd4fde1f
Commit
dd4fde1f
authored
Oct 25, 2024
by
zhougaofeng
Browse files
Update para_split_v2.py
parent
e469df71
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
16 deletions
+16
-16
magic_pdf/para/para_split_v2.py
magic_pdf/para/para_split_v2.py
+16
-16
No files found.
magic_pdf/para/para_split_v2.py
View file @
dd4fde1f
...
...
@@ -140,9 +140,9 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice
,
list_start_idx
=
find_repeating_patterns2
(
line_fea_encode
)
if
len
(
list_indice
)
>
0
:
if
debug_able
:
logger
.
info
(
f
"发现了列表,列表行数:
{
list_indice
}
,
{
list_start_idx
}
"
)
#
if len(list_indice) > 0:
#
if debug_able:
#
logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments
=
[]
...
...
@@ -150,12 +150,12 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for
i
in
range
(
start
,
end
+
1
):
if
i
>
0
:
if
line_fea_encode
[
i
]
==
4
:
if
debug_able
:
logger
.
info
(
f
"列表行的第
{
i
}
行不是顶格的"
)
#
if debug_able:
#
logger.info(f"列表行的第{i}行不是顶格的")
break
else
:
if
debug_able
:
logger
.
info
(
f
"列表行的第
{
start
}
到第
{
end
}
行是列表"
)
#
else:
#
if debug_able:
#
logger.info(f"列表行的第{start}到第{end}行是列表")
return
split_indices
(
total_lines
,
list_indice
),
list_start_idx
...
...
@@ -435,8 +435,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
if
pre_layout_list_info
[
1
]
and
not
next_layout_list_info
[
0
]
and
next_first_para
[
"type"
]
==
BlockType
.
Text
:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if
debug_able
:
logger
.
info
(
f
"连接page
{
page_num
}
内的list"
)
#
if debug_able:
#
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
lines
=
next_first_para
.
get
(
"lines"
,
[])
...
...
@@ -467,8 +467,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
if
pre_page_paras
[
-
1
][
-
1
][
"type"
]
!=
BlockType
.
Text
or
next_page_paras
[
0
][
0
][
"type"
]
!=
BlockType
.
Text
:
return
False
if
pre_page_list_info
[
1
]
and
not
next_page_list_info
[
0
]:
# 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if
debug_able
:
logger
.
info
(
f
"连接page
{
page_num
}
内的list"
)
#
if debug_able:
#
logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines
=
[]
next_page_first_para
=
next_page_paras
[
0
][
0
]
...
...
@@ -680,8 +680,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
first_line_text
=
''
.
join
([
__get_span_text
(
span
)
for
span
in
layout_para
[
start
][
"lines"
][
0
][
'spans'
]])
if
"Table"
in
first_line_text
or
"Figure"
in
first_line_text
:
pass
if
debug_able
:
logger
.
info
(
line_hi
.
std
())
#
if debug_able:
#
logger.info(line_hi.std())
if
line_hi
.
std
()
<
2
:
"""行高度相同,那么判断是否居中"""
...
...
@@ -693,8 +693,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
and
not
all
([
x1
==
layout_box
[
2
]
for
x1
in
all_right_x1
]):
merge_para
=
[
block
[
"lines"
][
0
]
for
block
in
layout_para
[
start
:
end
+
1
]]
para_text
=
''
.
join
([
__get_span_text
(
span
)
for
line
in
merge_para
for
span
in
line
[
'spans'
]])
if
debug_able
:
logger
.
info
(
para_text
)
#
if debug_able:
#
logger.info(para_text)
layout_para
[
start
][
"lines"
]
=
merge_para
for
i_para
in
range
(
start
+
1
,
end
+
1
):
layout_para
[
i_para
][
"lines"
]
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment